From 638c8bd254a5e52cc23f6ea8c5be865274002e18 Mon Sep 17 00:00:00 2001 From: jakeross Date: Sat, 22 Nov 2025 18:24:02 -0700 Subject: [PATCH 01/66] refactor: pass metrics object to transfer functions for improved data handling --- transfers/transfer.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/transfers/transfer.py b/transfers/transfer.py index 77275ed35..f38bc4220 100644 --- a/transfers/transfer.py +++ b/transfers/transfer.py @@ -50,13 +50,12 @@ def message(msg, pad=10, new_line_at_top=True): @timeit -def transfer_all(sess, limit=100): +def transfer_all(sess, metrics, limit=100): message("STARTING TRANSFER", new_line_at_top=False) logger.info("Erase and rebuilding database") erase_and_rebuild_db() - metrics = Metrics() message("TRANSFERRING WELLS") flags = { @@ -125,14 +124,13 @@ def transfer_all(sess, limit=100): timeit_direct(transfer_assets, sess) -def transfer_debugging(sess, limit=100): +def transfer_debugging(sess, metrics, limit=100): message("STARTING TRANSFER DEBUG", new_line_at_top=False) if int(os.environ.get("ERASE_AND_REBUILD", 0)): logger.info("Erase and rebuilding database") erase_and_rebuild_db() - metrics = Metrics() message("TRANSFERRING WELLS") flags = {"TRANSFER_ALL_WELLS": True} @@ -205,12 +203,15 @@ def transfer_debugging(sess, limit=100): def main(): message("START--------------------------------------") limit = int(os.environ.get("TRANSFER_LIMIT", 1000)) + metrics = Metrics() with session_ctx() as sess: if int(os.environ.get("TRANSFER_DEBUG", 0)): - transfer_debugging(sess, limit=limit) + transfer_debugging(sess, metrics, limit=limit) else: - transfer_all(sess, limit=limit) + transfer_all(sess, metrics, limit=limit) + metrics.close() + metrics.save_to_storage_bucket() # todo: move the log file to a storage bucket save_log_to_bucket() message("END--------------------------------------") From 1c0936759210310276afc4a245c941487ac0b353 Mon Sep 17 00:00:00 2001 From: jakeross Date: Tue, 25 Nov 2025 15:55:22 
-0700 Subject: [PATCH 02/66] refactor: enhance sensor transfer process with recording interval estimation and chunked transfers --- transfers/sensor_transfer.py | 60 ++- transfers/transfer.py | 6 +- transfers/util.py | 64 ++- transfers/well_transfer.py | 731 +++++++++++++++++++++++++++-------- 4 files changed, 669 insertions(+), 192 deletions(-) diff --git a/transfers/sensor_transfer.py b/transfers/sensor_transfer.py index f6ff49dcb..90e7273f1 100644 --- a/transfers/sensor_transfer.py +++ b/transfers/sensor_transfer.py @@ -18,7 +18,13 @@ from sqlalchemy import select from db import Sensor, Deployment, Thing -from transfers.util import read_csv, logger, filter_to_valid_point_ids, replace_nans +from transfers.util import ( + read_csv, + logger, + filter_to_valid_point_ids, + replace_nans, + RecordingIntervalEstimator, +) EQUIPMENT_TO_SENSOR_TYPE_MAP = { "Pressure transducer": "Pressure Transducer", @@ -37,6 +43,7 @@ def transfer_sensors(session): errors = [] grouped_equipment = cleaned_df.groupby(["PointID"]) added = {} + estimators = {} for index, group in grouped_equipment: pointid = index[0] thing = session.query(Thing).filter(Thing.name == pointid).first() @@ -127,23 +134,43 @@ def transfer_sensors(session): row.DateRemoved, "%Y-%m-%d %H:%M:%S.%f" ).date() + recording_interval_unit = "hour" try: recording_interval = int(row.RecordingInterval) except (ValueError, TypeError): - logger.critical( - f"name={sensor.name}, serial_no={sensor.serial_no} RecordingInterval is not an " - f"integer. Setting to None" - ) - recording_interval = None - errors.append( - { - "pointid": pointid, - "error": f"row.ID={row.ID}, row.SerialNo={row.SerialNo}. 
RecordingInterval is " - f"not an integer", - "table": source_table, - "field": "RecordingInterval", - } + + # try to calculate recording interval from measurements + if sensor_type in estimators: + estimator = estimators[sensor_type] + else: + estimator = RecordingIntervalEstimator(sensor_type) + estimators[sensor_type] = estimator + + recording_interval, unit = estimator.estimate_recording_interval( + row, installation_date, removal_date ) + + if recording_interval: + recording_interval_unit = unit + logger.info( + f"name={sensor.name}, serial_no={sensor.serial_no}. " + f"estimated recording interval: {recording_interval} " + ) + else: + + logger.critical( + f"name={sensor.name}, serial_no={sensor.serial_no} RecordingInterval is not an integer" + ) + + errors.append( + { + "pointid": pointid, + "error": f"row.ID={row.ID}, row.SerialNo={row.SerialNo}. RecordingInterval is " + f"not an integer", + "table": source_table, + "field": "RecordingInterval", + } + ) sql = ( select(Deployment) .join(Thing) @@ -166,7 +193,7 @@ def transfer_sensors(session): installation_date=installation_date, removal_date=removal_date, recording_interval=recording_interval, - recording_interval_units="hour", + recording_interval_units=recording_interval_unit, hanging_cable_length=row.HangingCableLength, hanging_point_height=row.HangingPointHgt, hanging_point_description=row.HangingPointDescription, @@ -189,6 +216,9 @@ def transfer_sensors(session): sensor.sensor_status = "Retired" session.commit() except Exception as e: + import traceback + + traceback.print_exc() logger.critical(f"Could not add sensor and deployment: {e}") errors.append({"pointid": pointid, "error": e, "table": source_table}) diff --git a/transfers/transfer.py b/transfers/transfer.py index f38bc4220..2098a85ea 100644 --- a/transfers/transfer.py +++ b/transfers/transfer.py @@ -133,13 +133,13 @@ def transfer_debugging(sess, metrics, limit=100): message("TRANSFERRING WELLS") - flags = {"TRANSFER_ALL_WELLS": True} + flags = 
{"TRANSFER_ALL_WELLS": True, "LIMIT": limit} - results = timeit_direct(transfer_wells, sess, flags=flags, limit=limit) + results = timeit_direct(transfer_wells, flags=flags) metrics.well_metrics(sess, *results) message("TRANSFERRING WELL SCREENS") - results = timeit_direct(transfer_wellscreens, sess) + results = timeit_direct(transfer_wellscreens, flags=flags) metrics.well_screen_metrics(sess, *results) message("TRANSFERRING SENSORS") diff --git a/transfers/util.py b/transfers/util.py index cbf0f2b17..31ad32e0a 100644 --- a/transfers/util.py +++ b/transfers/util.py @@ -53,6 +53,59 @@ } +class RecordingIntervalEstimator: + def __init__(self, sensor_type: str): + if sensor_type == "Pressure Transducer": + self._df = read_csv("WaterLevelsContinuous_Pressure") + else: + self._df = read_csv("WaterLevelsContinuous_Acoustic") + + # convert "DateMeasured" to date" + self._df["DateMeasured"] = pd.to_datetime(self._df["DateMeasured"]).dt.date + + def estimate_recording_interval( + self, + record: pd.Series, + installation_date: datetime = None, + removal_date: datetime = None, + ): + point_id = record.PointID + + cdf = self._df[self._df["PointID"] == point_id] + if len(cdf) == 0: + return None, None + + cdf = cdf.sort_values("DateMeasured") + if installation_date is not None: + cdf = cdf[cdf["DateMeasured"] >= installation_date] + if removal_date is not None: + cdf = cdf[cdf["DateMeasured"] <= removal_date] + + # calculate the average interval in seconds + try: + date_series = pd.to_datetime(cdf["DateMeasured"]) + intervals = date_series.diff().dropna().dt.total_seconds() + if len(intervals) == 0: + avg_interval = None + else: + avg_interval = intervals.mean() + except IndexError: + return None, None + + # convert to hours + avg_interval /= 3600 + + unit = "hour" + if avg_interval < 1: + avg_interval *= 60 + unit = "minute" + if avg_interval < 1: + avg_interval *= 60 + unit = "second" + + return int(avg_interval), unit + + def replace_nans(df: pd.DataFrame, default=None) -> 
pd.DataFrame: df = df.replace(pd.NA, default) return df.replace({np.nan: default}) @@ -127,11 +180,12 @@ def filter_by_welldata_datasource_and_project(df: pd.DataFrame) -> pd.DataFrame: reader = csv.reader(f) _ = next(reader) valid_datasources = [row[0] for row in reader if row[1] == "Yes"] - f.seek(0) - invalid_datasources = [row[0] for row in reader if row[1] == "NO"] - logger.info("Invalid WellData Datasources:") - for vd in invalid_datasources: - logger.info(f" {vd}") + + # f.seek(0) + # invalid_datasources = [row[0] for row in reader if row[1] == "NO"] + # logger.info("Invalid WellData Datasources:") + # for vd in invalid_datasources: + # logger.info(f" {vd}") counts = df.groupby("DataSource").size().reset_index(name="WellCount") counts = counts.sort_values("WellCount", ascending=False) diff --git a/transfers/well_transfer.py b/transfers/well_transfer.py index ee54d0216..bb393c17f 100644 --- a/transfers/well_transfer.py +++ b/transfers/well_transfer.py @@ -37,6 +37,7 @@ MonitoringFrequencyHistory, MeasuringPointHistory, ) +from db.engine import session_ctx from schemas.thing import CreateWell, CreateWellScreen from services.gcs_helper import get_storage_bucket from services.util import ( @@ -166,60 +167,98 @@ def dump_cached_elevations(lut: dict): blob.upload_from_string(json.dumps(lut)) -def transfer_wells(session: Session, flags: dict = None, limit: int = 0) -> None: - input_df, cleaned_df = get_wells_to_transfer(session, flags) +class Transferer(object): + input_df: pd.DataFrame = None + cleaned_df: pd.DataFrame = None + errors: list = None + flags: dict = None + + def __init__(self, flags: dict = None): + self.errors = [] + self.flags = flags if flags else {} + + def transfer(self): + with session_ctx() as session: + self.input_df, self.cleaned_df = self._get_dfs(session) + self._limit_iterator(session, self.flags.get("LIMIT", 0)) + + def _get_df_to_iterate(self) -> pd.DataFrame: + return self.cleaned_df + + def _limit_iterator(self, session: Session, 
limit: int, step: int = 25): + df = self._get_df_to_iterate() + n = len(df) + start_time = time.time() + for i, row in enumerate(df.itertuples()): + if limit and i >= limit: + logger.info(f"Reached limit of {limit} rows. Stopping migration.") + break + + if i and not i % step: + logger.info( + f"Processing row {i} of {n}, avg rows per second: {step / (time.time() - start_time):.2f}" + ) + start_time = time.time() + try: + session.commit() + except Exception as e: + logger.critical(f"Error committing wells. {e}") + session.rollback() + continue + + self._iterator(session, df, i, row) + + session.commit() + self._after_hook(session) + + def _iterator(self, session: Session, df: pd.DataFrame, i: int, row: dict): + raise NotImplementedError("Must implement _iterator method") + + def _after_hook(self, session: Session): + pass + + def _get_dfs(self, session: Session): + raise NotImplementedError("Must implement _get_dfs method") + + +class WellTransferer(Transferer): source_table = "WellData" - wdf = cleaned_df - n = len(wdf) - - step = 25 - start_time = time.time() - errors = [] - added_locations = {} - cached_elevations = get_cached_elevations() - for i, row in enumerate(wdf.itertuples()): + + def __init__(self, *args, **kw): + super().__init__(*args, **kw) + self._cached_elevations = get_cached_elevations() + self._added_locations = {} + + def _get_dfs(self, session: Session): + return get_wells_to_transfer(session, self.flags) + + def _iterator(self, session, df, i, row): pointid = row.PointID - if wdf[wdf["PointID"] == pointid].shape[0] > 1: + if df[df["PointID"] == pointid].shape[0] > 1: logger.critical( f"transfer_wells. PointID {pointid} has duplicate records. Skipping." ) - errors.append( + self.errors.append( { "pointid": pointid, "error": "duplicate records", - "table": source_table, + "table": self.source_table, "field": "PointID", } ) - continue - - if limit and i >= limit: - logger.info(f"Reached limit of {limit} rows. 
Stopping migration.") - break - - if i and not i % step: - logger.info( - f"Processing row {i} of {n}, avg rows per second: {step / (time.time() - start_time):.2f}" - ) - start_time = time.time() - try: - session.commit() - except Exception as e: - logger.critical(f"Error committing wells. {e}") - session.rollback() - continue + return location = None try: - location, elevation_method = make_location(row, cached_elevations) + location, elevation_method = make_location(row, self._cached_elevations) session.add(location) - added_locations[row.PointID] = elevation_method + self._added_locations[row.PointID] = elevation_method except Exception as e: if location is not None: session.expunge(location) # these rollbacks are cause an issue because they are discarding good data # session.rollback() - errors.append( + self.errors.append( { "pointid": row.PointID, "error": e, @@ -228,7 +267,7 @@ def transfer_wells(session: Session, flags: dict = None, limit: int = 0) -> None } ) logger.critical(f"Error making location for {row.PointID}: {e}") - continue + return try: first_visit_date = _get_first_visit_date(row) @@ -261,11 +300,13 @@ def transfer_wells(session: Session, flags: dict = None, limit: int = 0) -> None CreateWell.model_validate(data) except ValidationError as e: - errors.append({"pointid": row.PointID, "error": e, "table": "WellData"}) + self.errors.append( + {"pointid": row.PointID, "error": e, "table": "WellData"} + ) logger.critical( f"Validation error for row {i} with PointID {row.PointID}: {e.errors()}" ) - continue + return well = None try: @@ -320,9 +361,11 @@ def transfer_wells(session: Session, flags: dict = None, limit: int = 0) -> None if well is not None: session.expunge(well) - errors.append({"pointid": row.PointID, "error": e, "table": "WellData"}) + self.errors.append( + {"pointid": row.PointID, "error": e, "table": "WellData"} + ) logger.critical(f"Error creating well for {row.PointID}: {e}") - continue + return assoc = 
LocationThingAssociation(effective_start=location.created_at) @@ -330,155 +373,194 @@ def transfer_wells(session: Session, flags: dict = None, limit: int = 0) -> None assoc.thing = well session.add(assoc) - session.commit() + def _after_hook(self, session): + dump_cached_elevations(self._cached_elevations) - # add things thate need well id - for well in session.query(Thing).filter(Thing.thing_type == "water well").all(): - row = wdf[wdf["PointID"] == well.name].iloc[0] - if not isna(row.Notes): - note = well.add_note(row.Notes, "Other") - session.add(note) + # add things thate need well id + for well in session.query(Thing).filter(Thing.thing_type == "water well").all(): + row = self.cleaned_df[self.cleaned_df["PointID"] == well.name].iloc[0] + if not isna(row.Notes): + note = well.add_note(row.Notes, "Other") + session.add(note) - location = well.current_location - elevation_method = added_locations[row.PointID] - data_provenances = make_location_data_provenance( - row, location, elevation_method - ) - for dp in data_provenances: - session.add(dp) - - """ - Developer's note - - It's not clear when the measuring point from NM_Aquifer was - determined, so I'm setting start_date to the day of the transfer - """ - measuring_point_history = MeasuringPointHistory( - thing_id=well.id, - measuring_point_height=row.MPHeight, - measuring_point_description=row.MeasuringPoint, - start_date=datetime.now(tz=UTC), - end_date=None, - ) - session.add(measuring_point_history) - - """ - Developer's notes - - For all status_history records the start_date will be now since that - isn't recorded in NM_Aquifer - """ - # TODO: if row.MonitoringStatus == "Q" is it monitored or not? <-- AMMP review - # TODO: if row.MonitoringStatus == "X" can that change? 
<-- AMMP review - # TODO: have AMMP review and verify the various MonitoringStatus codes - - target_id = well.id - target_table = "thing" - if not isna(row.MonitoringStatus): - if ( - "X" in row.MonitoringStatus - or "I" in row.MonitoringStatus - or "C" in row.MonitoringStatus - ): - status_value = "Not currently monitored" - else: - status_value = "Currently monitored" - - status_history = StatusHistory( - status_type="Monitoring Status", - status_value=status_value, - reason=row.MonitorStatusReason, - start_date=datetime.now(tz=UTC), - target_id=target_id, - target_table=target_table, + location = well.current_location + elevation_method = self._added_locations[row.PointID] + data_provenances = make_location_data_provenance( + row, location, elevation_method ) - session.add(status_history) - logger.info( - f" Added monitoring status for well {well.name}: {status_value}" - ) - - for code in NMA_MONITORING_FREQUENCY.keys(): - if code in row.MonitoringStatus: - monitoring_frequency = NMA_MONITORING_FREQUENCY[code] - monitoring_frequency_history = MonitoringFrequencyHistory( - thing_id=well.id, - monitoring_frequency=monitoring_frequency, - start_date=datetime.now(tz=UTC), - end_date=None, - ) - session.add(monitoring_frequency_history) - logger.info( - f" Adding '{monitoring_frequency}' monitoring frequency for well {well.name}" - ) - - if not isna(row.Status): - status_value = lexicon_mapper.map_value(f"LU_Status:{row.Status}") - status_history = StatusHistory( - status_type="Well Status", - status_value=status_value, - reason=row.StatusUserNotes, + for dp in data_provenances: + session.add(dp) + + """ + Developer's note + + It's not clear when the measuring point from NM_Aquifer was + determined, so I'm setting start_date to the day of the transfer + """ + measuring_point_history = MeasuringPointHistory( + thing_id=well.id, + measuring_point_height=row.MPHeight, + measuring_point_description=row.MeasuringPoint, start_date=datetime.now(tz=UTC), - 
target_id=target_id, - target_table=target_table, + end_date=None, ) - session.add(status_history) - logger.info(f" Added well status for well {well.name}: {status_value}") + session.add(measuring_point_history) + + """ + Developer's notes + + For all status_history records the start_date will be now since that + isn't recorded in NM_Aquifer + """ + # TODO: if row.MonitoringStatus == "Q" is it monitored or not? <-- AMMP review + # TODO: if row.MonitoringStatus == "X" can that change? <-- AMMP review + # TODO: have AMMP review and verify the various MonitoringStatus codes + + target_id = well.id + target_table = "thing" + if not isna(row.MonitoringStatus): + if ( + "X" in row.MonitoringStatus + or "I" in row.MonitoringStatus + or "C" in row.MonitoringStatus + ): + status_value = "Not currently monitored" + else: + status_value = "Currently monitored" + + status_history = StatusHistory( + status_type="Monitoring Status", + status_value=status_value, + reason=row.MonitorStatusReason, + start_date=datetime.now(tz=UTC), + target_id=target_id, + target_table=target_table, + ) + session.add(status_history) + logger.info( + f" Added monitoring status for well {well.name}: {status_value}" + ) + + for code in NMA_MONITORING_FREQUENCY.keys(): + if code in row.MonitoringStatus: + monitoring_frequency = NMA_MONITORING_FREQUENCY[code] + monitoring_frequency_history = MonitoringFrequencyHistory( + thing_id=well.id, + monitoring_frequency=monitoring_frequency, + start_date=datetime.now(tz=UTC), + end_date=None, + ) + session.add(monitoring_frequency_history) + logger.info( + f" Adding '{monitoring_frequency}' monitoring frequency for well {well.name}" + ) + + if not isna(row.Status): + status_value = lexicon_mapper.map_value(f"LU_Status:{row.Status}") + status_history = StatusHistory( + status_type="Well Status", + status_value=status_value, + reason=row.StatusUserNotes, + start_date=datetime.now(tz=UTC), + target_id=target_id, + target_table=target_table, + ) + 
session.add(status_history) + logger.info(f" Added well status for well {well.name}: {status_value}") - session.commit() + session.commit() + + +class ChunkTransferer(Transferer): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.chunk_size = 1000 + + def chunk_transfer(self): + with session_ctx() as session: + self.input_df, self.cleaned_df = self._get_dfs(session) + df = self._get_df_to_iterate() + for ci, chunk in enumerate(chunk_by_size(df, self.chunk_size)): + dbchunk = self._get_df_chunk(session, chunk) + logger.info( + f"Processing chunk {ci}, {len(chunk)} rows, {len(dbchunk)} db items" + ) + for i, row in enumerate(chunk.itertuples()): + dbitem = self._get_db_item(dbchunk, row) + if not dbitem: + self._missing_db_item_warning(row) + continue + self._chunk_iterator(session, df, i, row, dbitem) + session.commit() - dump_cached_elevations(cached_elevations) - return input_df, cleaned_df, errors + def _get_df_chunk(self, session, chunk): + raise NotImplementedError("Must be implemented in subclass") + def _missing_db_item_warning(self, row): + raise NotImplementedError("Must be implemented in subclass") -def transfer_wellscreens(session, limit=None): + def _chunk_iterator(self, session, df, i, row, dbitem): + raise NotImplementedError("Must be implemented in subclass") - input_df = read_csv("WellScreens") - wdf = replace_nans(input_df) + def _get_db_item(self, chunk, row): + raise NotImplementedError("Must be implemented in subclass") - cleaned_df = filter_to_valid_point_ids(session, wdf) - errors = [] - for ci, chunk in enumerate(chunk_by_size(cleaned_df, 1000)): +class WellScreenTransferer(ChunkTransferer): + def _get_dfs(self, session: Session): + input_df = read_csv("WellScreens") + wdf = replace_nans(input_df) + cleaned_df = filter_to_valid_point_ids(session, wdf) + return input_df, cleaned_df + + def _get_df_chunk(self, session, chunk): things = ( session.query(Thing).filter(Thing.name.in_(chunk.PointID.tolist())).all() ) + 
return things + + def _get_db_item(self, dbchunk, row): + return next((thing for thing in dbchunk if thing.name == row.PointID), None) + + def _missing_db_item_warning(self, row): + logger.warning(f"Thing with PointID {row.PointID} not found in database.") + + def _chunk_iterator(self, session, df, i, row, db_item): + well_screen_data = { + "thing_id": db_item.id, + "screen_depth_top": row.ScreenTop, + "screen_depth_bottom": row.ScreenBottom, + # "screen_type": row.ScreenType, + "screen_description": row.ScreenDescription, + "release_status": "draft", + "nma_pk_wellscreens": row.GlobalID, + } + try: + # TODO: add validation logic here to ensure no overlapping screens for the same well + CreateWellScreen.model_validate(well_screen_data) + except ValidationError as e: + logger.critical( + f"Validation error for row {i} with PointID {row.PointID}: {e.errors()}" + ) + self.errors.append( + {"pointid": row.PointID, "error": e, "table": "WellScreens"} + ) + return - logger.info(f"Processing chunk {ci}, {len(chunk)} rows, {len(things)} things") - for i, row in enumerate(chunk.itertuples()): - thing = next((thing for thing in things if thing.name == row.PointID), None) - if not thing: - logger.warning( - f"Thing with PointID {row.PointID} not found. Skipping well screen." 
- ) - continue - - well_screen_data = { - "thing_id": thing.id, - "screen_depth_top": row.ScreenTop, - "screen_depth_bottom": row.ScreenBottom, - # "screen_type": row.ScreenType, - "screen_description": row.ScreenDescription, - "release_status": "draft", - "nma_pk_wellscreens": row.GlobalID, - } - try: - # TODO: add validation logic here to ensure no overlapping screens for the same well - CreateWellScreen.model_validate(well_screen_data) - except ValidationError as e: - logger.critical( - f"Validation error for row {i} with PointID {row.PointID}: {e.errors()}" - ) - errors.append( - {"pointid": row.PointID, "error": e, "table": "WellScreens"} - ) - continue + well_screen = WellScreen(**well_screen_data) + session.add(well_screen) - well_screen = WellScreen(**well_screen_data) - session.add(well_screen) - session.commit() +def transfer_wells(flags: dict = None): + transferer = WellTransferer(flags=flags) + transferer.transfer() + return transferer.input_df, transferer.cleaned_df, transferer.errors - return input_df, cleaned_df, errors + +def transfer_wellscreens(flags: dict = None): + transferer = WellScreenTransferer(flags=flags) + transferer.chunk_transfer() + return transferer.input_df, transferer.cleaned_df, transferer.errors def cleanup_locations(session): @@ -541,3 +623,314 @@ def cleanup_locations(session): # ============= EOF ============================================= +# def transfer_wells_old(session: Session, flags: dict = None, limit: int = 0) -> None: +# # input_df, cleaned_df = get_wells_to_transfer(session, flags) +# # wdf = cleaned_df +# # n = len(wdf) +# +# # step = 25 +# # start_time = time.time() +# errors = [] +# added_locations = {} +# # cached_elevations = get_cached_elevations() +# # for i, row in enumerate(wdf.itertuples()): +# # pointid = row.PointID +# # if wdf[wdf["PointID"] == pointid].shape[0] > 1: +# # logger.critical( +# # f"transfer_wells. PointID {pointid} has duplicate records. Skipping." 
+# # ) +# # errors.append( +# # { +# # "pointid": pointid, +# # "error": "duplicate records", +# # "table": source_table, +# # "field": "PointID", +# # } +# # ) +# # continue +# +# # if limit and i >= limit: +# # logger.info(f"Reached limit of {limit} rows. Stopping migration.") +# # break +# # +# # if i and not i % step: +# # logger.info( +# # f"Processing row {i} of {n}, avg rows per second: {step / (time.time() - start_time):.2f}" +# # ) +# # start_time = time.time() +# # try: +# # session.commit() +# # except Exception as e: +# # logger.critical(f"Error committing wells. {e}") +# # session.rollback() +# # continue +# +# # location = None +# # try: +# # location, elevation_method = make_location(row, cached_elevations) +# # session.add(location) +# # added_locations[row.PointID] = elevation_method +# # except Exception as e: +# # if location is not None: +# # session.expunge(location) +# # # these rollbacks are cause an issue because they are discarding good data +# # # session.rollback() +# # errors.append( +# # { +# # "pointid": row.PointID, +# # "error": e, +# # "table": "Location", +# # "field": str(e), +# # } +# # ) +# # logger.critical(f"Error making location for {row.PointID}: {e}") +# # continue +# # +# # try: +# # first_visit_date = _get_first_visit_date(row) +# # well_purposes = [] if isna(row.CurrentUse) else _extract_well_purposes(row) +# # well_casing_materials = ( +# # [] if isna(row.CasingDescription) else _extract_casing_materials(row) +# # ) +# # +# # # manually add the well rather than add_well from services/thing_helper.py +# # # so that effective_start can be set on the location assocation +# # +# # data = CreateWell( +# # location_id=location.id, +# # name=row.PointID, +# # first_visit_date=first_visit_date, +# # hole_depth=row.HoleDepth, +# # well_depth=row.WellDepth, +# # well_construction_notes=row.ConstructionNotes, +# # well_casing_diameter=( +# # row.CasingDiameter * 12 if row.CasingDiameter else None +# # ), +# # 
well_casing_depth=row.CasingDepth, +# # release_status="public" if row.PublicRelease else "private", +# # measuring_point_height=row.MPHeight, +# # measuring_point_description=row.MeasuringPoint, +# # notes=( +# # [{"content": row.Notes, "note_type": "Other"}] if row.Notes else [] +# # ), +# # ) +# # +# # CreateWell.model_validate(data) +# # except ValidationError as e: +# # errors.append({"pointid": row.PointID, "error": e, "table": "WellData"}) +# # logger.critical( +# # f"Validation error for row {i} with PointID {row.PointID}: {e.errors()}" +# # ) +# # continue +# # +# # well = None +# # try: +# # well_data = data.model_dump( +# # exclude=[ +# # "location_id", +# # "group_id", +# # "well_purposes", +# # "well_casing_materials", +# # "measuring_point_height", +# # "measuring_point_description", +# # ] +# # ) +# # well_data["thing_type"] = "water well" +# # well_data["nma_pk_welldata"] = row.WellID +# # +# # well_data.pop("notes") +# # well = Thing(**well_data) +# # session.add(well) +# # # logger.info(f"Created well for {row.PointID}") +# # +# # # flush well to access its ID for status_history +# # # session.flush() +# # +# # # session.commit() +# # # session.refresh(well) +# # # if notes: +# # # for ni in notes: +# # # nn = well.add_note(ni['content'], ni['note_type']) +# # # session.add(nn) +# # +# # if well_purposes: +# # for wp in well_purposes: +# # # TODO: add validation logic here +# # if wp in WellPurposeEnum: +# # wp_obj = WellPurpose(thing=well, purpose=wp) +# # session.add(wp_obj) +# # else: +# # logger.critical(f"{well.name}. Invalid well purpose: {wp}") +# # +# # if well_casing_materials: +# # for wcm in well_casing_materials: +# # # TODO: add validation logic here +# # if wcm in WellCasingMaterialEnum: +# # wcm_obj = WellCasingMaterial(thing=well, material=wcm) +# # session.add(wcm_obj) +# # else: +# # logger.critical( +# # f"{well.name}. 
Invalid well casing material: {wcm}" +# # ) +# # except Exception as e: +# # if well is not None: +# # session.expunge(well) +# # +# # errors.append({"pointid": row.PointID, "error": e, "table": "WellData"}) +# # logger.critical(f"Error creating well for {row.PointID}: {e}") +# # continue +# # +# # assoc = LocationThingAssociation(effective_start=location.created_at) +# # +# # assoc.location = location +# # assoc.thing = well +# # session.add(assoc) +# +# # session.commit() +# +# # # add things thate need well id +# # for well in session.query(Thing).filter(Thing.thing_type == "water well").all(): +# # row = wdf[wdf["PointID"] == well.name].iloc[0] +# # if not isna(row.Notes): +# # note = well.add_note(row.Notes, "Other") +# # session.add(note) +# # +# # location = well.current_location +# # elevation_method = added_locations[row.PointID] +# # data_provenances = make_location_data_provenance( +# # row, location, elevation_method +# # ) +# # for dp in data_provenances: +# # session.add(dp) +# # +# # """ +# # Developer's note +# # +# # It's not clear when the measuring point from NM_Aquifer was +# # determined, so I'm setting start_date to the day of the transfer +# # """ +# # measuring_point_history = MeasuringPointHistory( +# # thing_id=well.id, +# # measuring_point_height=row.MPHeight, +# # measuring_point_description=row.MeasuringPoint, +# # start_date=datetime.now(tz=UTC), +# # end_date=None, +# # ) +# # session.add(measuring_point_history) +# # +# # """ +# # Developer's notes +# # +# # For all status_history records the start_date will be now since that +# # isn't recorded in NM_Aquifer +# # """ +# # # TODO: if row.MonitoringStatus == "Q" is it monitored or not? <-- AMMP review +# # # TODO: if row.MonitoringStatus == "X" can that change? 
<-- AMMP review +# # # TODO: have AMMP review and verify the various MonitoringStatus codes +# # +# # target_id = well.id +# # target_table = "thing" +# # if not isna(row.MonitoringStatus): +# # if ( +# # "X" in row.MonitoringStatus +# # or "I" in row.MonitoringStatus +# # or "C" in row.MonitoringStatus +# # ): +# # status_value = "Not currently monitored" +# # else: +# # status_value = "Currently monitored" +# # +# # status_history = StatusHistory( +# # status_type="Monitoring Status", +# # status_value=status_value, +# # reason=row.MonitorStatusReason, +# # start_date=datetime.now(tz=UTC), +# # target_id=target_id, +# # target_table=target_table, +# # ) +# # session.add(status_history) +# # logger.info( +# # f" Added monitoring status for well {well.name}: {status_value}" +# # ) +# # +# # for code in NMA_MONITORING_FREQUENCY.keys(): +# # if code in row.MonitoringStatus: +# # monitoring_frequency = NMA_MONITORING_FREQUENCY[code] +# # monitoring_frequency_history = MonitoringFrequencyHistory( +# # thing_id=well.id, +# # monitoring_frequency=monitoring_frequency, +# # start_date=datetime.now(tz=UTC), +# # end_date=None, +# # ) +# # session.add(monitoring_frequency_history) +# # logger.info( +# # f" Adding '{monitoring_frequency}' monitoring frequency for well {well.name}" +# # ) +# # +# # if not isna(row.Status): +# # status_value = lexicon_mapper.map_value(f"LU_Status:{row.Status}") +# # status_history = StatusHistory( +# # status_type="Well Status", +# # status_value=status_value, +# # reason=row.StatusUserNotes, +# # start_date=datetime.now(tz=UTC), +# # target_id=target_id, +# # target_table=target_table, +# # ) +# # session.add(status_history) +# # logger.info(f" Added well status for well {well.name}: {status_value}") +# # +# # session.commit() +# # +# # dump_cached_elevations(cached_elevations) +# # return input_df, cleaned_df, errors + +# def transfer_wellscreens_old(session, limit=None): + +# input_df = read_csv("WellScreens") +# wdf = 
replace_nans(input_df) +# +# cleaned_df = filter_to_valid_point_ids(session, wdf) + +# errors = [] +# for ci, chunk in enumerate(chunk_by_size(cleaned_df, 1000)): +# things = ( +# session.query(Thing).filter(Thing.name.in_(chunk.PointID.tolist())).all() +# ) +# +# logger.info(f"Processing chunk {ci}, {len(chunk)} rows, {len(things)} things") +# for i, row in enumerate(chunk.itertuples()): +# thing = next((thing for thing in things if thing.name == row.PointID), None) +# if not thing: +# logger.warning( +# f"Thing with PointID {row.PointID} not found. Skipping well screen." +# ) +# continue +# +# well_screen_data = { +# "thing_id": thing.id, +# "screen_depth_top": row.ScreenTop, +# "screen_depth_bottom": row.ScreenBottom, +# # "screen_type": row.ScreenType, +# "screen_description": row.ScreenDescription, +# "release_status": "draft", +# "nma_pk_wellscreens": row.GlobalID, +# } +# try: +# # TODO: add validation logic here to ensure no overlapping screens for the same well +# CreateWellScreen.model_validate(well_screen_data) +# except ValidationError as e: +# logger.critical( +# f"Validation error for row {i} with PointID {row.PointID}: {e.errors()}" +# ) +# errors.append( +# {"pointid": row.PointID, "error": e, "table": "WellScreens"} +# ) +# continue +# +# well_screen = WellScreen(**well_screen_data) +# session.add(well_screen) +# +# session.commit() +# +# return input_df, cleaned_df, errors From cee74b4e355a2c2c8824f4c3d16b4dbc91128194 Mon Sep 17 00:00:00 2001 From: jakeross Date: Wed, 26 Nov 2025 08:21:11 -0700 Subject: [PATCH 03/66] refactor: improve error handling and logging in sensor transfer and deployment processes --- transfers/sensor_transfer.py | 17 ++-- transfers/transfer.py | 32 ++----- transfers/util.py | 92 ++++++++++++++++++-- transfers/waterlevels_transducer_transfer.py | 85 +++++++++++++----- transfers/well_transfer.py | 35 ++++---- 5 files changed, 184 insertions(+), 77 deletions(-) diff --git a/transfers/sensor_transfer.py 
b/transfers/sensor_transfer.py index 90e7273f1..0f0b98074 100644 --- a/transfers/sensor_transfer.py +++ b/transfers/sensor_transfer.py @@ -138,7 +138,7 @@ def transfer_sensors(session): try: recording_interval = int(row.RecordingInterval) except (ValueError, TypeError): - + error = "RecordingInterval is not an integer" # try to calculate recording interval from measurements if sensor_type in estimators: estimator = estimators[sensor_type] @@ -146,27 +146,26 @@ def transfer_sensors(session): estimator = RecordingIntervalEstimator(sensor_type) estimators[sensor_type] = estimator - recording_interval, unit = estimator.estimate_recording_interval( - row, installation_date, removal_date + recording_interval, unit, error = ( + estimator.estimate_recording_interval( + row, installation_date, removal_date + ) ) if recording_interval: recording_interval_unit = unit logger.info( f"name={sensor.name}, serial_no={sensor.serial_no}. " - f"estimated recording interval: {recording_interval} " + f"estimated recording interval: {recording_interval} {unit}" ) else: - logger.critical( - f"name={sensor.name}, serial_no={sensor.serial_no} RecordingInterval is not an integer" + f"name={sensor.name}, serial_no={sensor.serial_no} error={error}" ) - errors.append( { "pointid": pointid, - "error": f"row.ID={row.ID}, row.SerialNo={row.SerialNo}. RecordingInterval is " - f"not an integer", + "error": f"name={sensor.name}, row.SerialNo={row.SerialNo}. 
error={error}", "table": source_table, "field": "RecordingInterval", } diff --git a/transfers/transfer.py b/transfers/transfer.py index 2098a85ea..5ad8f8c11 100644 --- a/transfers/transfer.py +++ b/transfers/transfer.py @@ -162,29 +162,19 @@ def transfer_debugging(sess, metrics, limit=100): message("TRANSFERRING CONTACTS") results = timeit_direct(transfer_contacts, sess) metrics.contact_metrics(sess, *results) - # + message("TRANSFERRING WATER LEVELS") results = timeit_direct(transfer_water_levels, sess) metrics.water_level_metrics(sess, *results) - # message("TRANSFERRING WATER LEVELS PRESSURE") - # results = timeit_direct(transfer_water_levels_pressure, sess) - # metrics.pressure_metrics(sess, *results) - - # message("TRANSFERRING WATER LEVELS ACOUSTIC") - # results = timeit_direct(transfer_water_levels_acoustic, sess) - # metrics.acoustic_metrics(sess, *results) + message("TRANSFERRING WATER LEVELS PRESSURE") + results = timeit_direct(transfer_water_levels_pressure, sess) + metrics.pressure_metrics(sess, *results) - """ - Developer's notes + message("TRANSFERRING WATER LEVELS ACOUSTIC") + results = timeit_direct(transfer_water_levels_acoustic, sess) + metrics.acoustic_metrics(sess, *results) - When transfering water chemistry data use the qc_type field to indicate - normal/blanks/duplicates instead of what comes from LU_SampleType. Use - those values, however, to map to the standard qc_type fields if applicable - (i.e. 
not applicable when sample type is "Soil or rock sample" or - "Precipitation," but is applicable when sample type is "Equipment blank" - or "Field duplicate") - """ # message("TRANSFERRING LINK IDS") # timeit_direct(transfer_link_ids, sess) # timeit_direct(transfer_link_ids_welldata, sess) @@ -192,20 +182,16 @@ def transfer_debugging(sess, metrics, limit=100): # message("TRANSFERRING GROUPS") # timeit_direct(transfer_groups, sess) - # message("TRANSFERRING WATER LEVELS ACOUSTIC") - # timeit_direct(transfer_water_levels_acoustic, sess) # message("TRANSFERRING ASSETS") # timeit_direct(transfer_assets, sess) - metrics.close() - metrics.save_to_storage_bucket() def main(): message("START--------------------------------------") - limit = int(os.environ.get("TRANSFER_LIMIT", 1000)) + limit = int(os.getenv("TRANSFER_LIMIT", 1000)) metrics = Metrics() with session_ctx() as sess: - if int(os.environ.get("TRANSFER_DEBUG", 0)): + if int(os.getenv("TRANSFER_DEBUG", 0)): transfer_debugging(sess, metrics, limit=limit) else: transfer_all(sess, metrics, limit=limit) diff --git a/transfers/util.py b/transfers/util.py index 31ad32e0a..a74a6a9d0 100644 --- a/transfers/util.py +++ b/transfers/util.py @@ -15,9 +15,10 @@ # =============================================================================== import csv import io +import math import os import re -from datetime import datetime, timezone, timedelta +from datetime import datetime, timezone, timedelta, UTC from pathlib import Path import numpy as np @@ -53,6 +54,58 @@ } +class MeasuringPointEstimator: + def __init__(self): + df = read_csv("WaterLevels") + df["DateMeasured"] = pd.to_datetime(df["DateMeasured"], errors="coerce") + self._df = df.dropna(subset=["DateMeasured"]) + + def estimate_measuring_point_height( + self, row + ) -> tuple[float, str, datetime | None]: + mph = row.MPHeight + mph_desc = row.MeasuringPoint + + df = self._df[self._df["PointID"] == row.PointID] + df = df.sort_values("DateMeasured") + if mph is None: + 
logger.info( + f"No MPHeight found for PointID: {row.PointID}. Estimating from measurements." + ) + # try to estimate mpheight from measurements + mphs = [] + start_dates = [] + mph_descs = [] + for m in df.itertuples(): + mphi = m.DepthToWater - m.DepthToWaterBGS + start_date = m.DateMeasured + if mphi not in mphs: + mphs.append(mphi) + mph_descs.append( + "Auto calculated from measurements at depth to water and depth to water below ground surface" + ) + start_dates.append(start_date) + + else: + mphs = [mph] + mph_descs = [mph_desc] + if len(df) > 0: + start_dates = [df["DateMeasured"].min()] + else: + start_dates = [datetime.now(tz=UTC)] + + if len(mphs) == 1: + end_dates = [None] + else: + end_dates = [start_dates[i + 1] for i in range(len(start_dates) - 1)] + end_dates.append(None) + + logger.info( + f"Estimated MPHeight: {mph}, {start_dates} for PointID: {row.PointID}." + ) + return zip(mphs, mph_descs, start_dates, end_dates) + + class RecordingIntervalEstimator: def __init__(self, sensor_type: str): if sensor_type == "Pressure Transducer": @@ -68,12 +121,12 @@ def estimate_recording_interval( record: pd.Series, installation_date: datetime = None, removal_date: datetime = None, - ): + ) -> tuple[int | None, str | None, str | None]: point_id = record.PointID cdf = self._df[self._df["PointID"] == point_id] if len(cdf) == 0: - return None, None + return None, None, f"No measurements found for PointID: {point_id}" cdf = cdf.sort_values("DateMeasured") if installation_date is not None: @@ -86,24 +139,47 @@ def estimate_recording_interval( date_series = pd.to_datetime(cdf["DateMeasured"]) intervals = date_series.diff().dropna().dt.total_seconds() if len(intervals) == 0: - avg_interval = None + logger.warning( + f"No intervals found for {point_id} for time range " + f"{installation_date}-{removal_date}. 
using entire series " + ) + # take average of entire series + df = self._df[self._df["PointID"] == point_id] + df = df.sort_values("DateMeasured") + date_series = pd.to_datetime(df["DateMeasured"]) + intervals = date_series.diff().dropna().dt.total_seconds() + if len(intervals) == 0: + return ( + None, + None, + f"No measurements found for {point_id} for entire series", + ) + else: + avg_interval = intervals.mean() else: avg_interval = intervals.mean() except IndexError: - return None, None + return ( + None, + None, + ( + f"Not enough measurements to calculate interval for PointID: {point_id}," + f"{installation_date} to {removal_date}." + ), + ) # convert to hours avg_interval /= 3600 unit = "hour" - if avg_interval < 1: + if avg_interval < 0.95: # if less then 57 minutes convert to minutes avg_interval *= 60 unit = "minute" - if avg_interval < 1: + if avg_interval < 0.95: # if less then 57 seconds convert to seconds avg_interval *= 60 unit = "second" - return int(avg_interval), unit + return math.ceil(avg_interval), unit, None def replace_nans(df: pd.DataFrame, default=None) -> pd.DataFrame: diff --git a/transfers/waterlevels_transducer_transfer.py b/transfers/waterlevels_transducer_transfer.py index 64e39b439..c6e76005c 100644 --- a/transfers/waterlevels_transducer_transfer.py +++ b/transfers/waterlevels_transducer_transfer.py @@ -34,6 +34,17 @@ def transfer_water_levels_pressure(session): return _transfer_water_levels_continuous(session, wd, "QCed", "Pressure Transducer") +def _find_deployment(ts, deployments): + for d in deployments: + start = Timestamp(d.installation_date) + if start > ts: + break # because sorted by start + end = Timestamp(d.removal_date) if d.removal_date else Timestamp.max + if end >= ts: + return d + return None + + def _transfer_water_levels_continuous(session, input_df, partition_field, sensor_type): from schemas.transducer import CreateTransducerObservation @@ -46,11 +57,16 @@ def _transfer_water_levels_continuous(session, input_df, 
partition_field, sensor cleaned_df = filter_to_valid_point_ids(session, input_df) # group by pointid + cleaned_df = cleaned_df.sort_values(by=["PointID"]) gwd = cleaned_df.groupby(["PointID"]) + n = len(gwd) errors = [] - for index, group in gwd: + nodeployments = {} + for i, (index, group) in enumerate(gwd): pointid = index[0] - logger.info(f"Processing PointID: {pointid}") + logger.info( + f"Processing PointID: {pointid}. {i + 1}/{n} ({100*(i+1)/n:0.2f}) completed." + ) deployments = ( session.query(Deployment) @@ -98,27 +114,47 @@ def _transfer_water_levels_continuous(session, input_df, partition_field, sensor continue observations = [] + + # min_deployment_date = Timestamp(min([d.installation_date for d in deployments])) + # max_deployment_date = Timestamp(max([d.removal_date or d.installation_date for d in deployments])) + deps_sorted = sorted( + deployments, key=lambda d: Timestamp(d.installation_date) + ) + for row in rows.itertuples(): - deployment = next( - ( - d - for d in deployments - if Timestamp(d.installation_date) <= row.DateMeasured - and ( - d.removal_date is None - or Timestamp(d.removal_date) >= row.DateMeasured - ) - ), - None, - ) + deployment = _find_deployment(row.DateMeasured, deps_sorted) + + # if min_deployment_date < row.DateMeasured < max_deployment_date: + # deployment = next( + # ( + # d + # for d in deployments + # if Timestamp(d.installation_date) <= row.DateMeasured + # and ( + # d.removal_date is None + # or Timestamp(d.removal_date) >= row.DateMeasured + # ) + # ), + # None, + # ) if deployment is None: - errors.append( - { - "pointid": pointid, - "error": f"no deployment at {row.DateMeasured}", - } - ) + # errors.append( + # { + # "pointid": pointid, + # "error": f"no deployment at {row.DateMeasured}", + # } + # ) + if pointid not in nodeployments: + nodeployments[pointid] = (row.DateMeasured, row.DateMeasured) + else: + min_date, max_date = nodeployments[pointid] + if row.DateMeasured < min_date: + min_date = row.DateMeasured + 
elif row.DateMeasured > max_date: + max_date = row.DateMeasured + nodeployments[pointid] = min_date, max_date + logger.critical( f"No deployment found for PointID={pointid} at {row.DateMeasured}" ) @@ -155,6 +191,15 @@ def _transfer_water_levels_continuous(session, input_df, partition_field, sensor session.rollback() continue + # convert nodeployments to errors + for pointid, (min_date, max_date) in nodeployments.items(): + errors.append( + { + "pointid": pointid, + "error": f"no deployment between {min_date} and {max_date}", + } + ) + return input_df, cleaned_df, errors diff --git a/transfers/well_transfer.py b/transfers/well_transfer.py index bb393c17f..a8a8a22b4 100644 --- a/transfers/well_transfer.py +++ b/transfers/well_transfer.py @@ -56,6 +56,7 @@ lexicon_mapper, filter_non_transferred_wells, chunk_by_size, + MeasuringPointEstimator, ) ADDED = [] @@ -291,8 +292,10 @@ def _iterator(self, session, df, i, row): ), well_casing_depth=row.CasingDepth, release_status="public" if row.PublicRelease else "private", - measuring_point_height=row.MPHeight, - measuring_point_description=row.MeasuringPoint, + measuring_point_height=0, + measuring_point_description="", + # measuring_point_height=row.MPHeight, + # measuring_point_description=row.MeasuringPoint, notes=( [{"content": row.Notes, "note_type": "Other"}] if row.Notes else [] ), @@ -375,7 +378,7 @@ def _iterator(self, session, df, i, row): def _after_hook(self, session): dump_cached_elevations(self._cached_elevations) - + measuring_point_estimator = MeasuringPointEstimator() # add things thate need well id for well in session.query(Thing).filter(Thing.thing_type == "water well").all(): row = self.cleaned_df[self.cleaned_df["PointID"] == well.name].iloc[0] @@ -391,20 +394,18 @@ def _after_hook(self, session): for dp in data_provenances: session.add(dp) - """ - Developer's note - - It's not clear when the measuring point from NM_Aquifer was - determined, so I'm setting start_date to the day of the transfer - """ - 
measuring_point_history = MeasuringPointHistory( - thing_id=well.id, - measuring_point_height=row.MPHeight, - measuring_point_description=row.MeasuringPoint, - start_date=datetime.now(tz=UTC), - end_date=None, - ) - session.add(measuring_point_history) + mphs = measuring_point_estimator.estimate_measuring_point_height(row) + + for mph, mph_desc, start_date, end_date in mphs: + measuring_point_history = MeasuringPointHistory( + thing_id=well.id, + measuring_point_height=mph, + measuring_point_description=mph_desc, + # start_date=datetime.now(tz=UTC), + start_date=start_date, + end_date=end_date, + ) + session.add(measuring_point_history) """ Developer's notes From cb8c81f8de098f75c531ee3f8300ffd3371bdcbd Mon Sep 17 00:00:00 2001 From: jakeross Date: Wed, 26 Nov 2025 08:23:04 -0700 Subject: [PATCH 04/66] refactor: remove commented-out code for deployment date checks in waterlevels_transducer_transfer.py --- transfers/waterlevels_transducer_transfer.py | 22 -------------------- 1 file changed, 22 deletions(-) diff --git a/transfers/waterlevels_transducer_transfer.py b/transfers/waterlevels_transducer_transfer.py index c6e76005c..f1ef30cd1 100644 --- a/transfers/waterlevels_transducer_transfer.py +++ b/transfers/waterlevels_transducer_transfer.py @@ -115,8 +115,6 @@ def _transfer_water_levels_continuous(session, input_df, partition_field, sensor observations = [] - # min_deployment_date = Timestamp(min([d.installation_date for d in deployments])) - # max_deployment_date = Timestamp(max([d.removal_date or d.installation_date for d in deployments])) deps_sorted = sorted( deployments, key=lambda d: Timestamp(d.installation_date) ) @@ -124,27 +122,7 @@ def _transfer_water_levels_continuous(session, input_df, partition_field, sensor for row in rows.itertuples(): deployment = _find_deployment(row.DateMeasured, deps_sorted) - # if min_deployment_date < row.DateMeasured < max_deployment_date: - # deployment = next( - # ( - # d - # for d in deployments - # if 
Timestamp(d.installation_date) <= row.DateMeasured - # and ( - # d.removal_date is None - # or Timestamp(d.removal_date) >= row.DateMeasured - # ) - # ), - # None, - # ) - if deployment is None: - # errors.append( - # { - # "pointid": pointid, - # "error": f"no deployment at {row.DateMeasured}", - # } - # ) if pointid not in nodeployments: nodeployments[pointid] = (row.DateMeasured, row.DateMeasured) else: From bd9a2955f5168f5281de54eebfa36b484e700ffd Mon Sep 17 00:00:00 2001 From: jakeross Date: Wed, 26 Nov 2025 11:04:59 -0700 Subject: [PATCH 05/66] refactor: streamline transfer function calls by consolidating flags usage --- transfers/transfer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/transfers/transfer.py b/transfers/transfer.py index 5ad8f8c11..15c3cc40f 100644 --- a/transfers/transfer.py +++ b/transfers/transfer.py @@ -61,13 +61,14 @@ def transfer_all(sess, metrics, limit=100): flags = { "TRANSFER_ALL_WELLS": True, "TRANSFER_ALL_WELLSCREENS": True, + "LIMIT": limit, } - results = timeit_direct(transfer_wells, sess, flags=flags, limit=limit) + results = timeit_direct(transfer_wells, flags=flags) metrics.well_metrics(sess, *results) message("TRANSFERRING WELL SCREENS") - results = timeit_direct(transfer_wellscreens, sess) + results = timeit_direct(transfer_wellscreens, flags=flags) metrics.well_screen_metrics(sess, *results) message("TRANSFERRING SENSORS") From 0283aeee6d1c7414e5f075cebcff64ab3193a844 Mon Sep 17 00:00:00 2001 From: jakeross Date: Wed, 26 Nov 2025 16:46:08 -0700 Subject: [PATCH 06/66] refactor: implement SensorTransferer class for improved sensor data handling and transfer process --- transfers/sensor_transfer.py | 552 +++++++++++++++++++++++------------ transfers/transfer.py | 28 +- transfers/transferer.py | 189 ++++++++++++ transfers/util.py | 7 +- transfers/well_transfer.py | 450 ++-------------------------- 5 files changed, 595 insertions(+), 631 deletions(-) create mode 100644 transfers/transferer.py diff 
--git a/transfers/sensor_transfer.py b/transfers/sensor_transfer.py index 0f0b98074..6c9a75cbc 100644 --- a/transfers/sensor_transfer.py +++ b/transfers/sensor_transfer.py @@ -18,6 +18,7 @@ from sqlalchemy import select from db import Sensor, Deployment, Thing +from transfers.transferer import ThingBasedTransferer from transfers.util import ( read_csv, logger, @@ -33,207 +34,378 @@ } -def transfer_sensors(session): +class SensorTransferer(ThingBasedTransferer): source_table = "Equipment" - input_df = read_csv(source_table) - input_df.columns = input_df.columns.str.replace(" ", "_") - input_df = input_df[input_df.SerialNo.notna()] - cleaned_df = filter_to_valid_point_ids(session, input_df) - cleaned_df = replace_nans(cleaned_df) - errors = [] - grouped_equipment = cleaned_df.groupby(["PointID"]) - added = {} - estimators = {} - for index, group in grouped_equipment: - pointid = index[0] - thing = session.query(Thing).filter(Thing.name == pointid).first() - if thing is None: - logger.warning( - f"Skipping sensor transfer for Thing with PointID {pointid} since it is not in the DB" - ) - continue - ordered_group = group.sort_values(by=["DateInstalled"]) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._estimators = {} + self._added = {} + + def _get_dfs(self, session): + input_df = read_csv(self.source_table) + input_df.columns = input_df.columns.str.replace(" ", "_") + input_df = input_df[input_df.SerialNo.notna()] + cleaned_df = filter_to_valid_point_ids(session, input_df) + cleaned_df = replace_nans(cleaned_df) + return input_df, cleaned_df + + def _no_db_item_warning(self, index): + return f"Skipping sensor transfer for Thing with PointID {index[0]} since it is not in the DB" + + def _get_prepped_group(self, group): + return group.sort_values(by=["DateInstalled"]) + + def _step(self, session, row, db_item): + pointid = self._get_point_id(row, db_item) try: - for row in ordered_group.itertuples(): - try: - sensor_type = 
EQUIPMENT_TO_SENSOR_TYPE_MAP[row.EquipmentType] - except KeyError as e: - logger.critical( - f"Skipping equipment with type {row.EquipmentType} for point {pointid}" - ) - error = ( - f"key error adding sensor_type:{row.EquipmentType} error: {e}" - ) - errors.append( - { - "pointid": pointid, - "error": error, - "table": source_table, - "field": "EquipmentType", - } - ) - continue - - if row.SerialNo in added: - logger.info( - f"Sensor with serial number {row.SerialNo} already added in this transfer session. Only creating deployment for that record" - ) - sensor = added[row.SerialNo] - else: - sensor = ( - session.query(Sensor) - .filter(Sensor.serial_no == row.SerialNo) - .one_or_none() - ) - if sensor: - logger.info( - f"Sensor with serial number {row.SerialNo} already exists. Only creating deployment for that record" - ) - - if not sensor: - # TODO: Add validation - sensor = Sensor( - nma_pk_equipment=row.GlobalID, - name=row.ID, - sensor_type=sensor_type, - model=row.Model, - serial_no=row.SerialNo, - owner_agency="NMBGMR", - notes=row.Equipment_Notes, - ) - added[row.SerialNo] = sensor - session.add(sensor) - logger.info( - f"Added sensor {sensor.name} with serial number {sensor.serial_no}" - ) - - if row.DateInstalled: - installation_date = datetime.strptime( - row.DateInstalled, "%Y-%m-%d %H:%M:%S.%f" - ).date() - else: - logger.critical( - f"Installation Date cannot be None. Skipping deployment. Sensor: {row.ID}, " - f"SerialNo: {row.SerialNo} PointID: {pointid}" - ) - errors.append( - { - "pointid": pointid, - "error": f"row.ID={row.ID}, row.SerialNo={row.SerialNo}. 
Installation Date cannot " - f"be None", - "table": source_table, - "field": "DateInstalled", - } - ) - continue - - removal_date = None - if row.DateRemoved: - removal_date = datetime.strptime( - row.DateRemoved, "%Y-%m-%d %H:%M:%S.%f" - ).date() - - recording_interval_unit = "hour" - try: - recording_interval = int(row.RecordingInterval) - except (ValueError, TypeError): - error = "RecordingInterval is not an integer" - # try to calculate recording interval from measurements - if sensor_type in estimators: - estimator = estimators[sensor_type] - else: - estimator = RecordingIntervalEstimator(sensor_type) - estimators[sensor_type] = estimator - - recording_interval, unit, error = ( - estimator.estimate_recording_interval( - row, installation_date, removal_date - ) - ) - - if recording_interval: - recording_interval_unit = unit - logger.info( - f"name={sensor.name}, serial_no={sensor.serial_no}. " - f"estimated recording interval: {recording_interval} {unit}" - ) - else: - logger.critical( - f"name={sensor.name}, serial_no={sensor.serial_no} error={error}" - ) - errors.append( - { - "pointid": pointid, - "error": f"name={sensor.name}, row.SerialNo={row.SerialNo}. 
error={error}", - "table": source_table, - "field": "RecordingInterval", - } - ) - sql = ( - select(Deployment) - .join(Thing) - .join(Sensor) - .where(Thing.name == pointid) - .where(Sensor.serial_no == sensor.serial_no) - .where(Deployment.installation_date == installation_date) - .where(Deployment.removal_date == removal_date) - ) + sensor_type = EQUIPMENT_TO_SENSOR_TYPE_MAP[row.EquipmentType] + except KeyError as e: + logger.critical( + f"Skipping equipment with type {row.EquipmentType} for point {pointid}" + ) + error = f"key error adding sensor_type:{row.EquipmentType} error: {e}" + self.errors.append( + { + "pointid": pointid, + "error": error, + "table": self.source_table, + "field": "EquipmentType", + } + ) + return - existing_deployment = session.execute(sql).scalars().one_or_none() - if existing_deployment: - logger.info("existing deployment") - continue - - # TODO: add validation - deployment = Deployment( - thing=thing, - sensor=sensor, - installation_date=installation_date, - removal_date=removal_date, - recording_interval=recording_interval, - recording_interval_units=recording_interval_unit, - hanging_cable_length=row.HangingCableLength, - hanging_point_height=row.HangingPointHgt, - hanging_point_description=row.HangingPointDescription, + if row.SerialNo in self._added: + logger.info( + f"Sensor with serial number {row.SerialNo} already added in this transfer session. Only creating deployment for that record" + ) + sensor = self._added[row.SerialNo] + else: + sensor = ( + session.query(Sensor) + .filter(Sensor.serial_no == row.SerialNo) + .one_or_none() + ) + if sensor: + logger.info( + f"Sensor with serial number {row.SerialNo} already exists. 
Only creating deployment for that record" ) - session.add(deployment) + + if not sensor: + # TODO: Add validation + sensor = Sensor( + nma_pk_equipment=row.GlobalID, + name=row.ID, + sensor_type=sensor_type, + model=row.Model, + serial_no=row.SerialNo, + owner_agency="NMBGMR", + notes=row.Equipment_Notes, + ) + self._added[row.SerialNo] = sensor + session.add(sensor) + logger.info( + f"Added sensor {sensor.name} with serial number {sensor.serial_no}" + ) + + if row.DateInstalled: + installation_date = datetime.strptime( + row.DateInstalled, "%Y-%m-%d %H:%M:%S.%f" + ).date() + else: + pointid = self._get_point_id(row) + logger.critical( + f"Installation Date cannot be None. Skipping deployment. Sensor: {row.ID}, " + f"SerialNo: {row.SerialNo} PointID: {pointid}" + ) + self.errors.append( + { + "pointid": pointid, + "error": f"row.ID={row.ID}, row.SerialNo={row.SerialNo}. Installation Date cannot " + f"be None", + "table": self.source_table, + "field": "DateInstalled", + } + ) + return + + removal_date = None + if row.DateRemoved: + removal_date = datetime.strptime( + row.DateRemoved, "%Y-%m-%d %H:%M:%S.%f" + ).date() + + recording_interval_unit = "hour" + try: + recording_interval = int(row.RecordingInterval) + except (ValueError, TypeError): + # try to calculate recording interval from measurements + if sensor_type in self._estimators: + estimator = self._estimators[sensor_type] + else: + estimator = RecordingIntervalEstimator(sensor_type) + self._estimators[sensor_type] = estimator + + recording_interval, unit, error = estimator.estimate_recording_interval( + row, installation_date, removal_date + ) + + if recording_interval: + recording_interval_unit = unit logger.info( - f"Added deployment for sensor with serial number {sensor.serial_no}, deployed to {thing.name}: | Installation Date: {installation_date} | Removal Date: {removal_date}" + f"name={sensor.name}, serial_no={sensor.serial_no}. 
" + f"estimated recording interval: {recording_interval} {unit}" + ) + else: + logger.critical( + f"name={sensor.name}, serial_no={sensor.serial_no} error={error}" ) - """ - Developer's notes + self.errors.append( + { + "pointid": pointid, + "error": f"name={sensor.name}, row.SerialNo={row.SerialNo}. error={error}", + "table": self.source_table, + "field": "RecordingInterval", + } + ) + + sql = ( + select(Deployment) + .join(Thing) + .join(Sensor) + .where(Thing.name == pointid) + .where(Sensor.serial_no == sensor.serial_no) + .where(Deployment.installation_date == installation_date) + .where(Deployment.removal_date == removal_date) + ) + + existing_deployment = session.execute(sql).scalars().one_or_none() + if existing_deployment: + logger.info("existing deployment") + return - Since it's unclear beforehand if a sensor has been removed just update - the sensor_status based off of each deployments installation/removal - dates - """ - if installation_date: - sensor.sensor_status = "In Service" - if removal_date: - sensor.sensor_status = "Retired" - session.commit() - except Exception as e: - import traceback + # TODO: add validation + deployment = Deployment( + thing=db_item, + sensor=sensor, + installation_date=installation_date, + removal_date=removal_date, + recording_interval=recording_interval, + recording_interval_units=recording_interval_unit, + hanging_cable_length=row.HangingCableLength, + hanging_point_height=row.HangingPointHgt, + hanging_point_description=row.HangingPointDescription, + ) + session.add(deployment) + logger.info( + f"Added deployment for sensor with serial number {sensor.serial_no}, deployed to {db_item.name}: | " + f"Installation Date: {installation_date} | Removal Date: {removal_date}" + ) - traceback.print_exc() - logger.critical(f"Could not add sensor and deployment: {e}") - errors.append({"pointid": pointid, "error": e, "table": source_table}) + """ + Developer's notes - return input_df, cleaned_df, errors + Since it's unclear 
beforehand if a sensor has been removed just update + the sensor_status based off of each deployments installation/removal + dates + """ + if installation_date: + sensor.sensor_status = "In Service" + if removal_date: + sensor.sensor_status = "Retired" + + +# def transfer_sensors(session): +# source_table = "Equipment" +# input_df = read_csv(source_table) +# input_df.columns = input_df.columns.str.replace(" ", "_") +# input_df = input_df[input_df.SerialNo.notna()] +# cleaned_df = filter_to_valid_point_ids(session, input_df) +# cleaned_df = replace_nans(cleaned_df) +# errors = [] +# grouped_equipment = cleaned_df.groupby(["PointID"]) +# added = {} +# estimators = {} +# for index, group in grouped_equipment: +# pointid = index[0] +# thing = session.query(Thing).filter(Thing.name == pointid).first() +# if thing is None: +# logger.warning( +# f"Skipping sensor transfer for Thing with PointID {pointid} since it is not in the DB" +# ) +# continue +# ordered_group = group.sort_values(by=["DateInstalled"]) +# +# try: +# for row in ordered_group.itertuples(): +# try: +# sensor_type = EQUIPMENT_TO_SENSOR_TYPE_MAP[row.EquipmentType] +# except KeyError as e: +# logger.critical( +# f"Skipping equipment with type {row.EquipmentType} for point {pointid}" +# ) +# error = ( +# f"key error adding sensor_type:{row.EquipmentType} error: {e}" +# ) +# errors.append( +# { +# "pointid": pointid, +# "error": error, +# "table": source_table, +# "field": "EquipmentType", +# } +# ) +# continue +# +# if row.SerialNo in added: +# logger.info( +# f"Sensor with serial number {row.SerialNo} already added in this transfer session. Only creating deployment for that record" +# ) +# sensor = added[row.SerialNo] +# else: +# sensor = ( +# session.query(Sensor) +# .filter(Sensor.serial_no == row.SerialNo) +# .one_or_none() +# ) +# if sensor: +# logger.info( +# f"Sensor with serial number {row.SerialNo} already exists. 
Only creating deployment for that record" +# ) +# +# if not sensor: +# # TODO: Add validation +# sensor = Sensor( +# nma_pk_equipment=row.GlobalID, +# name=row.ID, +# sensor_type=sensor_type, +# model=row.Model, +# serial_no=row.SerialNo, +# owner_agency="NMBGMR", +# notes=row.Equipment_Notes, +# ) +# added[row.SerialNo] = sensor +# session.add(sensor) +# logger.info( +# f"Added sensor {sensor.name} with serial number {sensor.serial_no}" +# ) +# +# if row.DateInstalled: +# installation_date = datetime.strptime( +# row.DateInstalled, "%Y-%m-%d %H:%M:%S.%f" +# ).date() +# else: +# logger.critical( +# f"Installation Date cannot be None. Skipping deployment. Sensor: {row.ID}, " +# f"SerialNo: {row.SerialNo} PointID: {pointid}" +# ) +# errors.append( +# { +# "pointid": pointid, +# "error": f"row.ID={row.ID}, row.SerialNo={row.SerialNo}. Installation Date cannot " +# f"be None", +# "table": source_table, +# "field": "DateInstalled", +# } +# ) +# continue +# +# removal_date = None +# if row.DateRemoved: +# removal_date = datetime.strptime( +# row.DateRemoved, "%Y-%m-%d %H:%M:%S.%f" +# ).date() +# +# recording_interval_unit = "hour" +# try: +# recording_interval = int(row.RecordingInterval) +# except (ValueError, TypeError): +# error = "RecordingInterval is not an integer" +# # try to calculate recording interval from measurements +# if sensor_type in estimators: +# estimator = estimators[sensor_type] +# else: +# estimator = RecordingIntervalEstimator(sensor_type) +# estimators[sensor_type] = estimator +# +# recording_interval, unit, error = ( +# estimator.estimate_recording_interval( +# row, installation_date, removal_date +# ) +# ) +# +# if recording_interval: +# recording_interval_unit = unit +# logger.info( +# f"name={sensor.name}, serial_no={sensor.serial_no}. 
" +# f"estimated recording interval: {recording_interval} {unit}" +# ) +# else: +# logger.critical( +# f"name={sensor.name}, serial_no={sensor.serial_no} error={error}" +# ) +# errors.append( +# { +# "pointid": pointid, +# "error": f"name={sensor.name}, row.SerialNo={row.SerialNo}. error={error}", +# "table": source_table, +# "field": "RecordingInterval", +# } +# ) +# sql = ( +# select(Deployment) +# .join(Thing) +# .join(Sensor) +# .where(Thing.name == pointid) +# .where(Sensor.serial_no == sensor.serial_no) +# .where(Deployment.installation_date == installation_date) +# .where(Deployment.removal_date == removal_date) +# ) +# +# existing_deployment = session.execute(sql).scalars().one_or_none() +# if existing_deployment: +# logger.info("existing deployment") +# continue +# +# # TODO: add validation +# deployment = Deployment( +# thing=thing, +# sensor=sensor, +# installation_date=installation_date, +# removal_date=removal_date, +# recording_interval=recording_interval, +# recording_interval_units=recording_interval_unit, +# hanging_cable_length=row.HangingCableLength, +# hanging_point_height=row.HangingPointHgt, +# hanging_point_description=row.HangingPointDescription, +# ) +# session.add(deployment) +# logger.info( +# f"Added deployment for sensor with serial number {sensor.serial_no}, deployed to {thing.name}: | Installation Date: {installation_date} | Removal Date: {removal_date}" +# ) +# +# """ +# Developer's notes +# +# Since it's unclear beforehand if a sensor has been removed just update +# the sensor_status based off of each deployments installation/removal +# dates +# """ +# if installation_date: +# sensor.sensor_status = "In Service" +# if removal_date: +# sensor.sensor_status = "Retired" +# session.commit() +# except Exception as e: +# import traceback +# +# traceback.print_exc() +# logger.critical(f"Could not add sensor and deployment: {e}") +# errors.append({"pointid": pointid, "error": e, "table": source_table}) +# +# return input_df, cleaned_df, 
errors # ============= EOF ============================================= -def init_sensor(session): - sensor = Sensor() - sensor.name = "Groundwater level manual measurement" - sensor.description = "manual gwl measurement. needs to be replaced with measurementmethod(?) e.g. steel tape, eprobe, etc." - sensor.unit = "ft" - sensor.datetime_installed = datetime.now() - session.add(sensor) - session.commit() - - -if __name__ == "__main__": - transfer_sensors("abc") diff --git a/transfers/transfer.py b/transfers/transfer.py index 15c3cc40f..a2d7544a9 100644 --- a/transfers/transfer.py +++ b/transfers/transfer.py @@ -30,12 +30,9 @@ from transfers.group_transfer import transfer_groups from transfers.link_ids_transfer import transfer_link_ids, transfer_link_ids_welldata from transfers.contact_transfer import transfer_contacts -from transfers.sensor_transfer import transfer_sensors +from transfers.sensor_transfer import SensorTransferer from transfers.waterlevels_transfer import transfer_water_levels -from transfers.well_transfer import ( - transfer_wells, - transfer_wellscreens, -) +from transfers.well_transfer import WellTransferer, WellScreenTransferer from transfers.asset_transfer import transfer_assets from transfers.util import timeit, timeit_direct @@ -64,15 +61,15 @@ def transfer_all(sess, metrics, limit=100): "LIMIT": limit, } - results = timeit_direct(transfer_wells, flags=flags) + results = _execute_transfer(WellTransferer, flags=flags) metrics.well_metrics(sess, *results) message("TRANSFERRING WELL SCREENS") - results = timeit_direct(transfer_wellscreens, flags=flags) + results = _execute_transfer(WellScreenTransferer, flags=flags) metrics.well_screen_metrics(sess, *results) message("TRANSFERRING SENSORS") - results = timeit_direct(transfer_sensors, sess) + results = _execute_transfer(SensorTransferer, flags=flags) metrics.sensor_metrics(sess, *results) # Developer's notes all the metadata for these Things are not defined in the models/schemas yet' @@ -125,6 
+122,12 @@ def transfer_all(sess, metrics, limit=100): timeit_direct(transfer_assets, sess) +def _execute_transfer(klass, flags: dict = None): + transferer = klass(flags=flags) + transferer.transfer() + return transferer.input_df, transferer.cleaned_df, transferer.errors + + def transfer_debugging(sess, metrics, limit=100): message("STARTING TRANSFER DEBUG", new_line_at_top=False) @@ -134,17 +137,18 @@ def transfer_debugging(sess, metrics, limit=100): message("TRANSFERRING WELLS") - flags = {"TRANSFER_ALL_WELLS": True, "LIMIT": limit} + flags = {"TRANSFER_ALL_WELLS": True, "LIMIT": limit} # not currently used - results = timeit_direct(transfer_wells, flags=flags) + results = _execute_transfer(WellTransferer, flags=flags) metrics.well_metrics(sess, *results) message("TRANSFERRING WELL SCREENS") - results = timeit_direct(transfer_wellscreens, flags=flags) + results = _execute_transfer(WellScreenTransferer, flags=flags) metrics.well_screen_metrics(sess, *results) message("TRANSFERRING SENSORS") - results = timeit_direct(transfer_sensors, sess) + results = _execute_transfer(SensorTransferer, flags=flags) + # results = timeit_direct(transfer_sensors, sess) metrics.sensor_metrics(sess, *results) # Developer's notes all the metadata for these Things are not defined in the models/schemas yet' diff --git a/transfers/transferer.py b/transfers/transferer.py new file mode 100644 index 000000000..273462585 --- /dev/null +++ b/transfers/transferer.py @@ -0,0 +1,189 @@ +# =============================================================================== +# Copyright 2025 ross +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== +import time + +import pandas as pd +from pandas import DataFrame +from sqlalchemy.orm import Session + +from db import Thing +from db.engine import session_ctx +from transfers.logger import logger +from transfers.util import chunk_by_size + + +class Transferer(object): + input_df: pd.DataFrame = None + cleaned_df: pd.DataFrame = None + errors: list = None + flags: dict = None + + def __init__(self, flags: dict = None): + self.errors = [] + self.flags = flags if flags else {} + + def transfer(self): + with session_ctx() as session: + self.input_df, self.cleaned_df = self._get_dfs(session) + self._transfer_hook(session) + session.commit() + + def _transfer_hook(self, session: Session): + self._limit_iterator(session, self.flags.get("LIMIT", 0)) + + def _get_df_to_iterate(self) -> pd.DataFrame: + return self.cleaned_df + + def _limit_iterator(self, session: Session, limit: int, step: int = 25): + df = self._get_df_to_iterate() + n = len(df) + start_time = time.time() + for i, row in enumerate(df.itertuples()): + if limit and i >= limit: + logger.info(f"Reached limit of {limit} rows. Stopping migration.") + break + + if i and not i % step: + logger.info( + f"Processing row {i} of {n}, avg rows per second: {step / (time.time() - start_time):.2f}" + ) + start_time = time.time() + try: + session.commit() + except Exception as e: + logger.critical(f"Error committing wells. 
{e}") + session.rollback() + continue + + self._iterator(session, df, i, row) + + session.commit() + self._after_hook(session) + + def _iterator(self, session: Session, df: pd.DataFrame, i: int, row: dict): + raise NotImplementedError("Must implement _iterator method") + + def _after_hook(self, session: Session): + pass + + def _get_dfs(self, session: Session): + raise NotImplementedError("Must implement _get_dfs method") + + +class ChunkTransferer(Transferer): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.chunk_size = 1000 + + def _transfer_hook(self, session: Session): + df = self._get_df_to_iterate() + for ci, chunk in enumerate(chunk_by_size(df, self.chunk_size)): + dbchunk = self._get_df_chunk(session, chunk) + logger.info( + f"Processing chunk {ci}, {len(chunk)} rows, {len(dbchunk)} db items" + ) + for i, row in enumerate(chunk.itertuples()): + dbitem = self._get_db_item(dbchunk, row) + if not dbitem: + self._missing_db_item_warning(row) + continue + self._chunk_iterator(session, df, i, row, dbitem) + + # def chunk_transfer(self): + # with session_ctx() as session: + # self.input_df, self.cleaned_df = self._get_dfs(session) + # df = self._get_df_to_iterate() + # for ci, chunk in enumerate(chunk_by_size(df, self.chunk_size)): + # dbchunk = self._get_df_chunk(session, chunk) + # logger.info( + # f"Processing chunk {ci}, {len(chunk)} rows, {len(dbchunk)} db items" + # ) + # for i, row in enumerate(chunk.itertuples()): + # dbitem = self._get_db_item(dbchunk, row) + # if not dbitem: + # self._missing_db_item_warning(row) + # continue + # self._chunk_iterator(session, df, i, row, dbitem) + # session.commit() + + def _get_df_chunk(self, session, chunk): + raise NotImplementedError("Must be implemented in subclass") + + def _missing_db_item_warning(self, row): + raise NotImplementedError("Must be implemented in subclass") + + def _chunk_iterator(self, session, df, i, row, dbitem): + raise NotImplementedError("Must be implemented in 
subclass") + + def _get_db_item(self, chunk, row): + raise NotImplementedError("Must be implemented in subclass") + + +class GroupTransferer(Transferer): + def _get_group(self): + return self.cleaned_df.groupby(["PointID"]) + + def _transfer_hook(self, session: Session): + self._group_iterator(session) + + def _group_iterator(self, session: Session): + groups = self._get_group() + for index, group in groups: + db_item = self._get_db_item(session, index) + if db_item is None: + logger.warning(self._no_db_item_warning(index)) + continue + + prepped_group = self._get_prepped_group(group) + for row in prepped_group.itertuples(): + try: + self._step(session, row, db_item) + except Exception as e: + import traceback + + pointid = self._get_point_id(row, db_item) + traceback.print_exc() + logger.critical(f"Could not add sensor and deployment: {e}") + self.errors.append( + {"pointid": pointid, "error": e, "table": self.source_table} + ) + + def _get_point_id(self, row, db_item) -> str: + return row.PointID + + def _step(self, session: Session, row, db_item): + raise NotImplementedError("Must be implemented in subclass") + + def _get_prepped_group(self, group) -> DataFrame: + raise NotImplementedError("Must be implemented in subclass") + + def _no_db_item_warning(self, index) -> str: + raise NotImplementedError("Must be implemented in subclass") + + def _get_db_item(self, session, index) -> Thing: + raise NotImplementedError("Must be implemented in subclass") + + +class ThingBasedTransferer(GroupTransferer): + def _get_group(self): + return self.cleaned_df.groupby(["PointID"]) + + def _get_db_item(self, session, index) -> Thing: + pointid = index[0] + return session.query(Thing).filter(Thing.name == pointid).first() + + +# ============= EOF ============================================= diff --git a/transfers/util.py b/transfers/util.py index a74a6a9d0..023d4a397 100644 --- a/transfers/util.py +++ b/transfers/util.py @@ -85,7 +85,9 @@ def estimate_measuring_point_height( 
"Auto calculated from measurements at depth to water and depth to water below ground surface" ) start_dates.append(start_date) - + logger.info( + f"Estimated MPHeight: {mphs}, {start_dates} for PointID: {row.PointID}." + ) else: mphs = [mph] mph_descs = [mph_desc] @@ -100,9 +102,6 @@ def estimate_measuring_point_height( end_dates = [start_dates[i + 1] for i in range(len(start_dates) - 1)] end_dates.append(None) - logger.info( - f"Estimated MPHeight: {mph}, {start_dates} for PointID: {row.PointID}." - ) return zip(mphs, mph_descs, start_dates, end_dates) diff --git a/transfers/well_transfer.py b/transfers/well_transfer.py index a8a8a22b4..cc049876d 100644 --- a/transfers/well_transfer.py +++ b/transfers/well_transfer.py @@ -14,7 +14,6 @@ # limitations under the License. # =============================================================================== import json -import time from datetime import datetime, UTC import pandas as pd @@ -37,7 +36,6 @@ MonitoringFrequencyHistory, MeasuringPointHistory, ) -from db.engine import session_ctx from schemas.thing import CreateWell, CreateWellScreen from services.gcs_helper import get_storage_bucket from services.util import ( @@ -45,6 +43,7 @@ get_county_from_point, get_quad_name_from_point, ) +from transfers.transferer import ChunkTransferer, Transferer from transfers.util import ( make_location, make_location_data_provenance, @@ -55,7 +54,6 @@ filter_by_welldata_datasource_and_project, lexicon_mapper, filter_non_transferred_wells, - chunk_by_size, MeasuringPointEstimator, ) @@ -122,8 +120,8 @@ def _extract_casing_materials(row) -> list[str]: def get_wells_to_transfer( sess: Session, flags: dict = None ) -> tuple[pd.DataFrame, pd.DataFrame]: - if flags is None: - flags = {} + # if flags is None: + # flags = {} wdf = read_csv("WellData", dtype={"OSEWelltagID": str}) ldf = read_csv("Location") @@ -134,17 +132,19 @@ def get_wells_to_transfer( input_df = wdf wdf = replace_nans(wdf) - if flags.get("TRANSFER_ALL_WELLS", True): - # 
todo: filter Locations by DataSource - cleaned_df = filter_by_welldata_datasource_and_project(wdf) - else: - # get a subset of wells that have not been transferred yet - # todo: this needs to be defined. - # for now, we are just filtering out wells that have not been transferred yet - # In the future we will be using criteria to determine which wells to transfer - # for example, wells in the "Water Level Network" project - cleaned_df = wdf + # if flags.get("TRANSFER_ALL_WELLS", False): + # # todo: filter Locations by DataSource + # cleaned_df = filter_by_welldata_datasource_and_project(wdf) + # else: + # # get a subset of wells that have not been transferred yet + # # todo: this needs to be defined. + # # for now, we are just filtering out wells that have not been transferred yet + # # In the future we will be using criteria to determine which wells to transfer + # # for example, wells in the "Water Level Network" project + # cleaned_df = wdf + + cleaned_df = filter_by_welldata_datasource_and_project(wdf) cleaned_df = filter_non_transferred_wells(sess, cleaned_df) return input_df, cleaned_df @@ -168,60 +168,6 @@ def dump_cached_elevations(lut: dict): blob.upload_from_string(json.dumps(lut)) -class Transferer(object): - input_df: pd.DataFrame = None - cleaned_df: pd.DataFrame = None - errors: list = None - flags: dict = None - - def __init__(self, flags: dict = None): - self.errors = [] - self.flags = flags if flags else {} - - def transfer(self): - with session_ctx() as session: - self.input_df, self.cleaned_df = self._get_dfs(session) - self._limit_iterator(session, self.flags.get("LIMIT", 0)) - - def _get_df_to_iterate(self) -> pd.DataFrame: - return self.cleaned_df - - def _limit_iterator(self, session: Session, limit: int, step: int = 25): - df = self._get_df_to_iterate() - n = len(df) - start_time = time.time() - for i, row in enumerate(df.itertuples()): - if limit and i >= limit: - logger.info(f"Reached limit of {limit} rows. 
Stopping migration.") - break - - if i and not i % step: - logger.info( - f"Processing row {i} of {n}, avg rows per second: {step / (time.time() - start_time):.2f}" - ) - start_time = time.time() - try: - session.commit() - except Exception as e: - logger.critical(f"Error committing wells. {e}") - session.rollback() - continue - - self._iterator(session, df, i, row) - - session.commit() - self._after_hook(session) - - def _iterator(self, session: Session, df: pd.DataFrame, i: int, row: dict): - raise NotImplementedError("Must implement _iterator method") - - def _after_hook(self, session: Session): - pass - - def _get_dfs(self, session: Session): - raise NotImplementedError("Must implement _get_dfs method") - - class WellTransferer(Transferer): source_table = "WellData" @@ -472,41 +418,6 @@ def _after_hook(self, session): session.commit() -class ChunkTransferer(Transferer): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.chunk_size = 1000 - - def chunk_transfer(self): - with session_ctx() as session: - self.input_df, self.cleaned_df = self._get_dfs(session) - df = self._get_df_to_iterate() - for ci, chunk in enumerate(chunk_by_size(df, self.chunk_size)): - dbchunk = self._get_df_chunk(session, chunk) - logger.info( - f"Processing chunk {ci}, {len(chunk)} rows, {len(dbchunk)} db items" - ) - for i, row in enumerate(chunk.itertuples()): - dbitem = self._get_db_item(dbchunk, row) - if not dbitem: - self._missing_db_item_warning(row) - continue - self._chunk_iterator(session, df, i, row, dbitem) - session.commit() - - def _get_df_chunk(self, session, chunk): - raise NotImplementedError("Must be implemented in subclass") - - def _missing_db_item_warning(self, row): - raise NotImplementedError("Must be implemented in subclass") - - def _chunk_iterator(self, session, df, i, row, dbitem): - raise NotImplementedError("Must be implemented in subclass") - - def _get_db_item(self, chunk, row): - raise NotImplementedError("Must be implemented 
in subclass") - - class WellScreenTransferer(ChunkTransferer): def _get_dfs(self, session: Session): input_df = read_csv("WellScreens") @@ -552,16 +463,16 @@ def _chunk_iterator(self, session, df, i, row, db_item): session.add(well_screen) -def transfer_wells(flags: dict = None): - transferer = WellTransferer(flags=flags) - transferer.transfer() - return transferer.input_df, transferer.cleaned_df, transferer.errors - - -def transfer_wellscreens(flags: dict = None): - transferer = WellScreenTransferer(flags=flags) - transferer.chunk_transfer() - return transferer.input_df, transferer.cleaned_df, transferer.errors +# def transfer_wells(flags: dict = None): +# transferer = WellTransferer(flags=flags) +# transferer.transfer() +# return transferer.input_df, transferer.cleaned_df, transferer.errors +# +# +# def transfer_wellscreens(flags: dict = None): +# transferer = WellScreenTransferer(flags=flags) +# transferer.chunk_transfer() +# return transferer.input_df, transferer.cleaned_df, transferer.errors def cleanup_locations(session): @@ -624,314 +535,3 @@ def cleanup_locations(session): # ============= EOF ============================================= -# def transfer_wells_old(session: Session, flags: dict = None, limit: int = 0) -> None: -# # input_df, cleaned_df = get_wells_to_transfer(session, flags) -# # wdf = cleaned_df -# # n = len(wdf) -# -# # step = 25 -# # start_time = time.time() -# errors = [] -# added_locations = {} -# # cached_elevations = get_cached_elevations() -# # for i, row in enumerate(wdf.itertuples()): -# # pointid = row.PointID -# # if wdf[wdf["PointID"] == pointid].shape[0] > 1: -# # logger.critical( -# # f"transfer_wells. PointID {pointid} has duplicate records. Skipping." -# # ) -# # errors.append( -# # { -# # "pointid": pointid, -# # "error": "duplicate records", -# # "table": source_table, -# # "field": "PointID", -# # } -# # ) -# # continue -# -# # if limit and i >= limit: -# # logger.info(f"Reached limit of {limit} rows. 
Stopping migration.") -# # break -# # -# # if i and not i % step: -# # logger.info( -# # f"Processing row {i} of {n}, avg rows per second: {step / (time.time() - start_time):.2f}" -# # ) -# # start_time = time.time() -# # try: -# # session.commit() -# # except Exception as e: -# # logger.critical(f"Error committing wells. {e}") -# # session.rollback() -# # continue -# -# # location = None -# # try: -# # location, elevation_method = make_location(row, cached_elevations) -# # session.add(location) -# # added_locations[row.PointID] = elevation_method -# # except Exception as e: -# # if location is not None: -# # session.expunge(location) -# # # these rollbacks are cause an issue because they are discarding good data -# # # session.rollback() -# # errors.append( -# # { -# # "pointid": row.PointID, -# # "error": e, -# # "table": "Location", -# # "field": str(e), -# # } -# # ) -# # logger.critical(f"Error making location for {row.PointID}: {e}") -# # continue -# # -# # try: -# # first_visit_date = _get_first_visit_date(row) -# # well_purposes = [] if isna(row.CurrentUse) else _extract_well_purposes(row) -# # well_casing_materials = ( -# # [] if isna(row.CasingDescription) else _extract_casing_materials(row) -# # ) -# # -# # # manually add the well rather than add_well from services/thing_helper.py -# # # so that effective_start can be set on the location assocation -# # -# # data = CreateWell( -# # location_id=location.id, -# # name=row.PointID, -# # first_visit_date=first_visit_date, -# # hole_depth=row.HoleDepth, -# # well_depth=row.WellDepth, -# # well_construction_notes=row.ConstructionNotes, -# # well_casing_diameter=( -# # row.CasingDiameter * 12 if row.CasingDiameter else None -# # ), -# # well_casing_depth=row.CasingDepth, -# # release_status="public" if row.PublicRelease else "private", -# # measuring_point_height=row.MPHeight, -# # measuring_point_description=row.MeasuringPoint, -# # notes=( -# # [{"content": row.Notes, "note_type": "Other"}] if row.Notes else 
[] -# # ), -# # ) -# # -# # CreateWell.model_validate(data) -# # except ValidationError as e: -# # errors.append({"pointid": row.PointID, "error": e, "table": "WellData"}) -# # logger.critical( -# # f"Validation error for row {i} with PointID {row.PointID}: {e.errors()}" -# # ) -# # continue -# # -# # well = None -# # try: -# # well_data = data.model_dump( -# # exclude=[ -# # "location_id", -# # "group_id", -# # "well_purposes", -# # "well_casing_materials", -# # "measuring_point_height", -# # "measuring_point_description", -# # ] -# # ) -# # well_data["thing_type"] = "water well" -# # well_data["nma_pk_welldata"] = row.WellID -# # -# # well_data.pop("notes") -# # well = Thing(**well_data) -# # session.add(well) -# # # logger.info(f"Created well for {row.PointID}") -# # -# # # flush well to access its ID for status_history -# # # session.flush() -# # -# # # session.commit() -# # # session.refresh(well) -# # # if notes: -# # # for ni in notes: -# # # nn = well.add_note(ni['content'], ni['note_type']) -# # # session.add(nn) -# # -# # if well_purposes: -# # for wp in well_purposes: -# # # TODO: add validation logic here -# # if wp in WellPurposeEnum: -# # wp_obj = WellPurpose(thing=well, purpose=wp) -# # session.add(wp_obj) -# # else: -# # logger.critical(f"{well.name}. Invalid well purpose: {wp}") -# # -# # if well_casing_materials: -# # for wcm in well_casing_materials: -# # # TODO: add validation logic here -# # if wcm in WellCasingMaterialEnum: -# # wcm_obj = WellCasingMaterial(thing=well, material=wcm) -# # session.add(wcm_obj) -# # else: -# # logger.critical( -# # f"{well.name}. 
Invalid well casing material: {wcm}" -# # ) -# # except Exception as e: -# # if well is not None: -# # session.expunge(well) -# # -# # errors.append({"pointid": row.PointID, "error": e, "table": "WellData"}) -# # logger.critical(f"Error creating well for {row.PointID}: {e}") -# # continue -# # -# # assoc = LocationThingAssociation(effective_start=location.created_at) -# # -# # assoc.location = location -# # assoc.thing = well -# # session.add(assoc) -# -# # session.commit() -# -# # # add things thate need well id -# # for well in session.query(Thing).filter(Thing.thing_type == "water well").all(): -# # row = wdf[wdf["PointID"] == well.name].iloc[0] -# # if not isna(row.Notes): -# # note = well.add_note(row.Notes, "Other") -# # session.add(note) -# # -# # location = well.current_location -# # elevation_method = added_locations[row.PointID] -# # data_provenances = make_location_data_provenance( -# # row, location, elevation_method -# # ) -# # for dp in data_provenances: -# # session.add(dp) -# # -# # """ -# # Developer's note -# # -# # It's not clear when the measuring point from NM_Aquifer was -# # determined, so I'm setting start_date to the day of the transfer -# # """ -# # measuring_point_history = MeasuringPointHistory( -# # thing_id=well.id, -# # measuring_point_height=row.MPHeight, -# # measuring_point_description=row.MeasuringPoint, -# # start_date=datetime.now(tz=UTC), -# # end_date=None, -# # ) -# # session.add(measuring_point_history) -# # -# # """ -# # Developer's notes -# # -# # For all status_history records the start_date will be now since that -# # isn't recorded in NM_Aquifer -# # """ -# # # TODO: if row.MonitoringStatus == "Q" is it monitored or not? <-- AMMP review -# # # TODO: if row.MonitoringStatus == "X" can that change? 
<-- AMMP review -# # # TODO: have AMMP review and verify the various MonitoringStatus codes -# # -# # target_id = well.id -# # target_table = "thing" -# # if not isna(row.MonitoringStatus): -# # if ( -# # "X" in row.MonitoringStatus -# # or "I" in row.MonitoringStatus -# # or "C" in row.MonitoringStatus -# # ): -# # status_value = "Not currently monitored" -# # else: -# # status_value = "Currently monitored" -# # -# # status_history = StatusHistory( -# # status_type="Monitoring Status", -# # status_value=status_value, -# # reason=row.MonitorStatusReason, -# # start_date=datetime.now(tz=UTC), -# # target_id=target_id, -# # target_table=target_table, -# # ) -# # session.add(status_history) -# # logger.info( -# # f" Added monitoring status for well {well.name}: {status_value}" -# # ) -# # -# # for code in NMA_MONITORING_FREQUENCY.keys(): -# # if code in row.MonitoringStatus: -# # monitoring_frequency = NMA_MONITORING_FREQUENCY[code] -# # monitoring_frequency_history = MonitoringFrequencyHistory( -# # thing_id=well.id, -# # monitoring_frequency=monitoring_frequency, -# # start_date=datetime.now(tz=UTC), -# # end_date=None, -# # ) -# # session.add(monitoring_frequency_history) -# # logger.info( -# # f" Adding '{monitoring_frequency}' monitoring frequency for well {well.name}" -# # ) -# # -# # if not isna(row.Status): -# # status_value = lexicon_mapper.map_value(f"LU_Status:{row.Status}") -# # status_history = StatusHistory( -# # status_type="Well Status", -# # status_value=status_value, -# # reason=row.StatusUserNotes, -# # start_date=datetime.now(tz=UTC), -# # target_id=target_id, -# # target_table=target_table, -# # ) -# # session.add(status_history) -# # logger.info(f" Added well status for well {well.name}: {status_value}") -# # -# # session.commit() -# # -# # dump_cached_elevations(cached_elevations) -# # return input_df, cleaned_df, errors - -# def transfer_wellscreens_old(session, limit=None): - -# input_df = read_csv("WellScreens") -# wdf = 
replace_nans(input_df) -# -# cleaned_df = filter_to_valid_point_ids(session, wdf) - -# errors = [] -# for ci, chunk in enumerate(chunk_by_size(cleaned_df, 1000)): -# things = ( -# session.query(Thing).filter(Thing.name.in_(chunk.PointID.tolist())).all() -# ) -# -# logger.info(f"Processing chunk {ci}, {len(chunk)} rows, {len(things)} things") -# for i, row in enumerate(chunk.itertuples()): -# thing = next((thing for thing in things if thing.name == row.PointID), None) -# if not thing: -# logger.warning( -# f"Thing with PointID {row.PointID} not found. Skipping well screen." -# ) -# continue -# -# well_screen_data = { -# "thing_id": thing.id, -# "screen_depth_top": row.ScreenTop, -# "screen_depth_bottom": row.ScreenBottom, -# # "screen_type": row.ScreenType, -# "screen_description": row.ScreenDescription, -# "release_status": "draft", -# "nma_pk_wellscreens": row.GlobalID, -# } -# try: -# # TODO: add validation logic here to ensure no overlapping screens for the same well -# CreateWellScreen.model_validate(well_screen_data) -# except ValidationError as e: -# logger.critical( -# f"Validation error for row {i} with PointID {row.PointID}: {e.errors()}" -# ) -# errors.append( -# {"pointid": row.PointID, "error": e, "table": "WellScreens"} -# ) -# continue -# -# well_screen = WellScreen(**well_screen_data) -# session.add(well_screen) -# -# session.commit() -# -# return input_df, cleaned_df, errors From 9432f8849e99b888ac1030b4a900ff328c13aa21 Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 26 Nov 2025 16:31:32 -0800 Subject: [PATCH 07/66] Unify read csv approaches --- .gitignore | 1 + transfers/util.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/.gitignore b/.gitignore index c1d8db1ee..f1bd9dd54 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ launcher.sh gcs_credentials.json transfers/data/assets* transfers/data/nma_csv_cache/* +transfers/data/*.csv transfers/transfer*.log transfer*.log transfers/data/nma_csv_cache/* diff --git 
a/transfers/util.py b/transfers/util.py index cbf0f2b17..590c9252d 100644 --- a/transfers/util.py +++ b/transfers/util.py @@ -59,10 +59,24 @@ def replace_nans(df: pd.DataFrame, default=None) -> pd.DataFrame: def read_csv(name: str, dtype: dict | None = None) -> pd.DataFrame: + # Try to read from local data directory first + local_file = Path(__file__).parent / 'data' / f"{name}.csv" + + if local_file.exists(): + logger.info(f"Reading {name} from local file: {local_file}") + if dtype: + return pd.read_csv(local_file, dtype=dtype) + else: + return pd.read_csv(local_file) + + # Check cache directory p = get_transfers_data_path(Path("nma_csv_cache") / f"{name}.csv") if os.path.exists(p): + logger.info(f"Reading {name} from cache: {p}") return pd.read_csv(p, dtype=dtype) + # Fall back to GCS if local file doesn't exist + logger.info(f"Local file and cache not found, reading {name} from GCS") bucket = get_storage_bucket() blob = bucket.blob(f"nma_csv/{name}.csv") data = blob.download_as_bytes() From 5db6964799f93c102c8f28851f84c0e3af69e3de Mon Sep 17 00:00:00 2001 From: kbighorse Date: Thu, 27 Nov 2025 00:31:24 +0000 Subject: [PATCH 08/66] Formatting changes --- transfers/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transfers/util.py b/transfers/util.py index 590c9252d..d08798425 100644 --- a/transfers/util.py +++ b/transfers/util.py @@ -60,7 +60,7 @@ def replace_nans(df: pd.DataFrame, default=None) -> pd.DataFrame: def read_csv(name: str, dtype: dict | None = None) -> pd.DataFrame: # Try to read from local data directory first - local_file = Path(__file__).parent / 'data' / f"{name}.csv" + local_file = Path(__file__).parent / "data" / f"{name}.csv" if local_file.exists(): logger.info(f"Reading {name} from local file: {local_file}") From fe6f50ccf91825676a8d57fbd574ce85dd6819ee Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 26 Nov 2025 16:55:38 -0800 Subject: [PATCH 09/66] Un-ignore features; add features for location and well 
dates --- .gitignore | 1 - tests/features/location-legacy-dates.feature | 57 +++++++++++++++++ tests/features/well-completion-date.feature | 64 ++++++++++++++++++++ 3 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 tests/features/location-legacy-dates.feature create mode 100644 tests/features/well-completion-date.feature diff --git a/.gitignore b/.gitignore index f1bd9dd54..44b28e13c 100644 --- a/.gitignore +++ b/.gitignore @@ -30,7 +30,6 @@ transfers/transfer*.log transfer*.log transfers/data/nma_csv_cache/* !transfers/data/nma_csv_cache/.gitkeep -tests/features/*.feature transfers/metrics/* transfers/logs/* run_bdd-local.sh diff --git a/tests/features/location-legacy-dates.feature b/tests/features/location-legacy-dates.feature new file mode 100644 index 000000000..1486d9edc --- /dev/null +++ b/tests/features/location-legacy-dates.feature @@ -0,0 +1,57 @@ +Feature: Location Legacy Date Fields + As a data manager + I want to preserve legacy date information from the AMPAPI system + So that historical temporal context is not lost during migration + + Background: + Given a functioning api + + Scenario: Create location with both legacy dates + When I create a location with legacy_date_created "2014-10-17" and inventoried_on "2003-12-10" + Then the response should include legacy_date_created as "2014-10-17" + And the response should include inventoried_on as "2003-12-10" + And the created_at timestamp should be the current system time + And the time gap between inventoried_on and legacy_date_created should be preserved + + Scenario: Create location with only legacy_date_created + When I create a location with legacy_date_created "2014-10-17" + Then the response should include legacy_date_created as "2014-10-17" + And the response should include inventoried_on as null + And the created_at timestamp should be the current system time + + Scenario: Create location with only inventoried_on + When I create a location with inventoried_on "2003-12-10" + Then 
the response should include inventoried_on as "2003-12-10" + And the response should include legacy_date_created as null + And the created_at timestamp should be the current system time + + Scenario: Create location with neither legacy date + When I create a location without legacy dates + Then the response should include legacy_date_created as null + And the response should include inventoried_on as null + And the created_at timestamp should be the current system time + + Scenario: Update location legacy dates + Given a location exists with legacy_date_created "2014-10-17" + When I update the location to add inventoried_on "2003-12-10" + Then the response should include legacy_date_created as "2014-10-17" + And the response should include inventoried_on as "2003-12-10" + + Scenario: Retrieve location with legacy dates via GET + Given a location exists with legacy_date_created "2014-10-17" and inventoried_on "2003-12-10" + When I retrieve the location by ID + Then the response should include legacy_date_created as "2014-10-17" + And the response should include inventoried_on as "2003-12-10" + + Scenario: Historical data preservation - 54 year gap (Site SM-0227) + When I create a location with legacy_date_created "2008-05-28" and inventoried_on "1954-05-01" + Then the response should include legacy_date_created as "2008-05-28" + And the response should include inventoried_on as "1954-05-01" + And the time gap should be approximately 19751 days + + Scenario: List locations includes legacy dates + Given multiple locations exist with various legacy dates + When I retrieve all locations + Then each location should include legacy_date_created field + And each location should include inventoried_on field + And the fields should be null for locations without legacy dates diff --git a/tests/features/well-completion-date.feature b/tests/features/well-completion-date.feature new file mode 100644 index 000000000..54f211ef5 --- /dev/null +++ 
b/tests/features/well-completion-date.feature @@ -0,0 +1,64 @@ +Feature: Well Completion Date + As a hydrogeologist + I want to track when wells were completed/constructed + So that I can analyze well age and relate construction standards to time periods + + Background: + Given a functioning api + + Scenario: Create water well with completion date + When I create a water well with well_completed_on "2004-08-08" + Then the response should include well_completed_on as "2004-08-08" + And the response should have thing_type "water well" + + Scenario: Create water well without completion date + When I create a water well without well_completed_on + Then the response should include well_completed_on as null + And the well should be created successfully + + Scenario: Update well to add completion date + Given a water well exists without well_completed_on + When I update the well to add well_completed_on "2004-08-08" + Then the response should include well_completed_on as "2004-08-08" + + Scenario: Update well to change completion date + Given a water well exists with well_completed_on "2004-08-08" + When I update the well to change well_completed_on to "2005-03-15" + Then the response should include well_completed_on as "2005-03-15" + + Scenario: Historical well from 1936 + When I create a water well with well_completed_on "1936-01-01" + Then the response should include well_completed_on as "1936-01-01" + And the well age should be over 88 years + + Scenario: Retrieve well with completion date via GET + Given a water well exists with well_completed_on "2004-08-08" + When I retrieve the well by ID + Then the response should include well_completed_on as "2004-08-08" + And the response should include the well's age in years + + Scenario: List wells includes completion dates + Given multiple wells exist with various completion dates + When I retrieve all water wells + Then each well should include well_completed_on field + And the field should be null for wells without 
completion dates + + Scenario: Spring does not have completion date + When I create a spring + Then the response should include well_completed_on as null + And the spring should be created successfully + + Scenario: Filter wells by completion date range + Given wells exist with completion dates ranging from 1936 to 2024 + When I filter wells completed between "2000-01-01" and "2010-12-31" + Then the response should only include wells completed in that range + And wells from 1936 should not be included + And wells from 2020 should not be included + + Scenario: Well completion date with location legacy dates + When I create a water well with well_completed_on "2004-08-08" + And the well's location has legacy_date_created "2014-10-17" and inventoried_on "2013-05-01" + Then the well should have well_completed_on as "2004-08-08" + And the location should have legacy_date_created as "2014-10-17" + And the location should have inventoried_on as "2013-05-01" + And all three date fields should be independently queryable From 738c1ef123120dca01ce9cb86ac234a594b9f7af Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 26 Nov 2025 17:04:36 -0800 Subject: [PATCH 10/66] Remove features we won't keep --- tests/features/location-legacy-dates.feature | 57 ----------------- tests/features/well-completion-date.feature | 64 -------------------- 2 files changed, 121 deletions(-) delete mode 100644 tests/features/location-legacy-dates.feature delete mode 100644 tests/features/well-completion-date.feature diff --git a/tests/features/location-legacy-dates.feature b/tests/features/location-legacy-dates.feature deleted file mode 100644 index 1486d9edc..000000000 --- a/tests/features/location-legacy-dates.feature +++ /dev/null @@ -1,57 +0,0 @@ -Feature: Location Legacy Date Fields - As a data manager - I want to preserve legacy date information from the AMPAPI system - So that historical temporal context is not lost during migration - - Background: - Given a functioning api - - 
Scenario: Create location with both legacy dates - When I create a location with legacy_date_created "2014-10-17" and inventoried_on "2003-12-10" - Then the response should include legacy_date_created as "2014-10-17" - And the response should include inventoried_on as "2003-12-10" - And the created_at timestamp should be the current system time - And the time gap between inventoried_on and legacy_date_created should be preserved - - Scenario: Create location with only legacy_date_created - When I create a location with legacy_date_created "2014-10-17" - Then the response should include legacy_date_created as "2014-10-17" - And the response should include inventoried_on as null - And the created_at timestamp should be the current system time - - Scenario: Create location with only inventoried_on - When I create a location with inventoried_on "2003-12-10" - Then the response should include inventoried_on as "2003-12-10" - And the response should include legacy_date_created as null - And the created_at timestamp should be the current system time - - Scenario: Create location with neither legacy date - When I create a location without legacy dates - Then the response should include legacy_date_created as null - And the response should include inventoried_on as null - And the created_at timestamp should be the current system time - - Scenario: Update location legacy dates - Given a location exists with legacy_date_created "2014-10-17" - When I update the location to add inventoried_on "2003-12-10" - Then the response should include legacy_date_created as "2014-10-17" - And the response should include inventoried_on as "2003-12-10" - - Scenario: Retrieve location with legacy dates via GET - Given a location exists with legacy_date_created "2014-10-17" and inventoried_on "2003-12-10" - When I retrieve the location by ID - Then the response should include legacy_date_created as "2014-10-17" - And the response should include inventoried_on as "2003-12-10" - - Scenario: 
Historical data preservation - 54 year gap (Site SM-0227) - When I create a location with legacy_date_created "2008-05-28" and inventoried_on "1954-05-01" - Then the response should include legacy_date_created as "2008-05-28" - And the response should include inventoried_on as "1954-05-01" - And the time gap should be approximately 19751 days - - Scenario: List locations includes legacy dates - Given multiple locations exist with various legacy dates - When I retrieve all locations - Then each location should include legacy_date_created field - And each location should include inventoried_on field - And the fields should be null for locations without legacy dates diff --git a/tests/features/well-completion-date.feature b/tests/features/well-completion-date.feature deleted file mode 100644 index 54f211ef5..000000000 --- a/tests/features/well-completion-date.feature +++ /dev/null @@ -1,64 +0,0 @@ -Feature: Well Completion Date - As a hydrogeologist - I want to track when wells were completed/constructed - So that I can analyze well age and relate construction standards to time periods - - Background: - Given a functioning api - - Scenario: Create water well with completion date - When I create a water well with well_completed_on "2004-08-08" - Then the response should include well_completed_on as "2004-08-08" - And the response should have thing_type "water well" - - Scenario: Create water well without completion date - When I create a water well without well_completed_on - Then the response should include well_completed_on as null - And the well should be created successfully - - Scenario: Update well to add completion date - Given a water well exists without well_completed_on - When I update the well to add well_completed_on "2004-08-08" - Then the response should include well_completed_on as "2004-08-08" - - Scenario: Update well to change completion date - Given a water well exists with well_completed_on "2004-08-08" - When I update the well to change 
well_completed_on to "2005-03-15" - Then the response should include well_completed_on as "2005-03-15" - - Scenario: Historical well from 1936 - When I create a water well with well_completed_on "1936-01-01" - Then the response should include well_completed_on as "1936-01-01" - And the well age should be over 88 years - - Scenario: Retrieve well with completion date via GET - Given a water well exists with well_completed_on "2004-08-08" - When I retrieve the well by ID - Then the response should include well_completed_on as "2004-08-08" - And the response should include the well's age in years - - Scenario: List wells includes completion dates - Given multiple wells exist with various completion dates - When I retrieve all water wells - Then each well should include well_completed_on field - And the field should be null for wells without completion dates - - Scenario: Spring does not have completion date - When I create a spring - Then the response should include well_completed_on as null - And the spring should be created successfully - - Scenario: Filter wells by completion date range - Given wells exist with completion dates ranging from 1936 to 2024 - When I filter wells completed between "2000-01-01" and "2010-12-31" - Then the response should only include wells completed in that range - And wells from 1936 should not be included - And wells from 2020 should not be included - - Scenario: Well completion date with location legacy dates - When I create a water well with well_completed_on "2004-08-08" - And the well's location has legacy_date_created "2014-10-17" and inventoried_on "2013-05-01" - Then the well should have well_completed_on as "2004-08-08" - And the location should have legacy_date_created as "2014-10-17" - And the location should have inventoried_on as "2013-05-01" - And all three date fields should be independently queryable From 953263252428153889933ffe74ecbd97ca133109 Mon Sep 17 00:00:00 2001 From: kbighorse Date: Thu, 27 Nov 2025 01:12:56 
+0000 Subject: [PATCH 11/66] Formatting changes --- .../steps/post_migration_legacy_data.py | 200 +++++++++++------- 1 file changed, 129 insertions(+), 71 deletions(-) diff --git a/tests/features/steps/post_migration_legacy_data.py b/tests/features/steps/post_migration_legacy_data.py index dca15d638..e78afbde7 100644 --- a/tests/features/steps/post_migration_legacy_data.py +++ b/tests/features/steps/post_migration_legacy_data.py @@ -27,6 +27,7 @@ def parse_number(text): return int(text) + register_type(Number=parse_number) @@ -91,14 +92,21 @@ def step_given_data_migrated(context: Context): @given("a location exists with") def step_given_location_with_table(context: Context): """Create location with fields from table.""" - data = {row['field']: row['value'] for row in context.table} + data = {row["field"]: row["value"] for row in context.table} - legacy_date_created = date.fromisoformat(data['legacy_date_created']) if data.get('legacy_date_created') and data['legacy_date_created'] != 'null' else None - inventoried_on = date.fromisoformat(data['inventoried_on']) if data.get('inventoried_on') and data['inventoried_on'] != 'null' else None + legacy_date_created = ( + date.fromisoformat(data["legacy_date_created"]) + if data.get("legacy_date_created") and data["legacy_date_created"] != "null" + else None + ) + inventoried_on = ( + date.fromisoformat(data["inventoried_on"]) + if data.get("inventoried_on") and data["inventoried_on"] != "null" + else None + ) location = create_test_location( - legacy_date_created=legacy_date_created, - inventoried_on=inventoried_on + legacy_date_created=legacy_date_created, inventoried_on=inventoried_on ) context.test_location = location @@ -122,12 +130,16 @@ def step_given_multiple_locations(context: Context, count: int): legacy_date, inventory_date = test_data[i] location = create_test_location( legacy_date_created=date.fromisoformat(legacy_date), - inventoried_on=date.fromisoformat(inventory_date) if inventory_date else None + 
inventoried_on=( + date.fromisoformat(inventory_date) if inventory_date else None + ), ) context.test_locations.append(location) -@given("locations exist with inventoried_on ranging from {start_year:Number} to {end_year:Number}") +@given( + "locations exist with inventoried_on ranging from {start_year:Number} to {end_year:Number}" +) def step_given_locations_date_range(context: Context, start_year: int, end_year: int): """Create locations with inventoried_on across a date range.""" context.test_locations = [] @@ -136,15 +148,17 @@ def step_given_locations_date_range(context: Context, start_year: int, end_year: for year in years: location = create_test_location( legacy_date_created=date(year + 5, 1, 1), # Always 5 years after inventory - inventoried_on=date(year, 6, 15) + inventoried_on=date(year, 6, 15), ) context.test_locations.append(location) @given('{count:Number} locations exist with legacy_date_created "{target_date}"') -def step_given_locations_with_specific_date(context: Context, count: int, target_date: str): +def step_given_locations_with_specific_date( + context: Context, count: int, target_date: str +): """Create locations with specific legacy_date_created.""" - if not hasattr(context, 'test_locations'): + if not hasattr(context, "test_locations"): context.test_locations = [] target = date.fromisoformat(target_date) @@ -152,7 +166,7 @@ def step_given_locations_with_specific_date(context: Context, count: int, target for i in range(count): location = create_test_location( legacy_date_created=target, - inventoried_on=date(2000 + i, 1, 1) # Vary the inventory dates + inventoried_on=date(2000 + i, 1, 1), # Vary the inventory dates ) context.test_locations.append(location) @@ -160,7 +174,9 @@ def step_given_locations_with_specific_date(context: Context, count: int, target @given('a well exists with well_completed_on "{completion_date}"') def step_given_well_with_completion(context: Context, completion_date: str): """Create well with completion date.""" - 
completed_on = date.fromisoformat(completion_date) if completion_date != 'null' else None + completed_on = ( + date.fromisoformat(completion_date) if completion_date != "null" else None + ) thing, location = create_test_well(well_completed_on=completed_on) @@ -185,7 +201,9 @@ def step_given_multiple_wells(context: Context, count: int): ] for i in range(min(count, len(completion_dates))): - completed_on = date.fromisoformat(completion_dates[i]) if completion_dates[i] else None + completed_on = ( + date.fromisoformat(completion_dates[i]) if completion_dates[i] else None + ) thing, location = create_test_well(well_completed_on=completed_on) context.test_wells.append(thing) @@ -197,7 +215,9 @@ def step_given_wells_with_null_completion(context: Context, null_count: int): pass -@given("wells exist with completion dates from {start_year:Number} to {end_year:Number}") +@given( + "wells exist with completion dates from {start_year:Number} to {end_year:Number}" +) def step_given_wells_date_range(context: Context, start_year: int, end_year: int): """Create wells with completion dates across range.""" context.test_wells = [] @@ -213,7 +233,7 @@ def step_given_wells_specific_years(context: Context, years: str): """Create wells with specific completion years.""" context.test_wells = [] - year_list = [int(y.strip()) for y in years.split(',')] + year_list = [int(y.strip()) for y in years.split(",")] for year in year_list: thing, location = create_test_well(well_completed_on=date(year, 6, 15)) @@ -223,7 +243,7 @@ def step_given_wells_specific_years(context: Context, years: str): @given("some wells have null well_completed_on") def step_given_some_wells_null(context: Context): """Add wells without completion dates.""" - if not hasattr(context, 'test_wells'): + if not hasattr(context, "test_wells"): context.test_wells = [] for i in range(2): @@ -234,10 +254,18 @@ def step_given_some_wells_null(context: Context): @given("that well's location has") def 
step_given_well_location_has_table(context: Context): """Set legacy dates on the well's location.""" - data = {row['field']: row['value'] for row in context.table} + data = {row["field"]: row["value"] for row in context.table} - legacy_date_created = date.fromisoformat(data.get('legacy_date_created')) if data.get('legacy_date_created') else None - inventoried_on = date.fromisoformat(data.get('inventoried_on')) if data.get('inventoried_on') else None + legacy_date_created = ( + date.fromisoformat(data.get("legacy_date_created")) + if data.get("legacy_date_created") + else None + ) + inventoried_on = ( + date.fromisoformat(data.get("inventoried_on")) + if data.get("inventoried_on") + else None + ) with session_ctx() as session: location = session.get(Location, context.test_well_location.id) @@ -255,11 +283,11 @@ def step_given_count_locations_migrated(context: Context, count: int): for i in range(count): # 9% have inventoried_on - has_inventory = (i < count * 0.09) + has_inventory = i < count * 0.09 location = create_test_location( legacy_date_created=date(2014, 1, i % 28 + 1), - inventoried_on=date(2003, 1, i % 28 + 1) if has_inventory else None + inventoried_on=date(2003, 1, i % 28 + 1) if has_inventory else None, ) context.test_locations.append(location) @@ -277,7 +305,7 @@ def step_given_count_wells_migrated(context: Context, count: int): for i in range(count): # 30% have completion dates - has_completion = (i < count * 0.30) + has_completion = i < count * 0.30 thing, location = create_test_well( well_completed_on=date(2000 + (i % 24), 1, 1) if has_completion else None @@ -295,8 +323,7 @@ def step_given_completion_count(context: Context, count: int): def step_given_location_migrated_with_dates(context: Context): """Create location with both legacy dates.""" location = create_test_location( - legacy_date_created=date(2014, 4, 3), - inventoried_on=date(2002, 12, 10) + legacy_date_created=date(2014, 4, 3), inventoried_on=date(2002, 12, 10) ) context.test_location = 
location @@ -319,6 +346,7 @@ def step_given_well_null_completion(context: Context): # WHEN steps + @when("I retrieve that location via the API") def step_when_retrieve_location_api(context: Context): """Retrieve location via GET API.""" @@ -335,7 +363,9 @@ def step_when_get_all_locations(context: Context): context.locations_response = response.json() -@when('I filter locations where inventoried_on is between "{start_date}" and "{end_date}"') +@when( + 'I filter locations where inventoried_on is between "{start_date}" and "{end_date}"' +) def step_when_filter_locations(context: Context, start_date: str, end_date: str): """Filter locations by date range.""" # Since API may not support this yet, query database directly @@ -343,10 +373,11 @@ def step_when_filter_locations(context: Context, start_date: str, end_date: str) start = date.fromisoformat(start_date) end = date.fromisoformat(end_date) - locations = session.query(Location).filter( - Location.inventoried_on >= start, - Location.inventoried_on <= end - ).all() + locations = ( + session.query(Location) + .filter(Location.inventoried_on >= start, Location.inventoried_on <= end) + .all() + ) context.filtered_locations = locations @@ -356,9 +387,9 @@ def step_when_query_by_legacy_date(context: Context, target_date: str): """Query locations by legacy_date_created.""" with session_ctx() as session: target = date.fromisoformat(target_date) - locations = session.query(Location).filter( - Location.legacy_date_created == target - ).all() + locations = ( + session.query(Location).filter(Location.legacy_date_created == target).all() + ) context.queried_locations = locations @@ -378,18 +409,24 @@ def step_when_get_all_wells(context: Context): context.wells_response = response.json() -@when('I filter wells where well_completed_on is between "{start_date}" and "{end_date}"') +@when( + 'I filter wells where well_completed_on is between "{start_date}" and "{end_date}"' +) def step_when_filter_wells(context: Context, start_date: 
str, end_date: str): """Filter wells by completion date range.""" with session_ctx() as session: start = date.fromisoformat(start_date) end = date.fromisoformat(end_date) - wells = session.query(Thing).filter( - Thing.thing_type == "water well", - Thing.well_completed_on >= start, - Thing.well_completed_on <= end - ).all() + wells = ( + session.query(Thing) + .filter( + Thing.thing_type == "water well", + Thing.well_completed_on >= start, + Thing.well_completed_on <= end, + ) + .all() + ) context.filtered_wells = wells @@ -398,9 +435,12 @@ def step_when_filter_wells(context: Context, start_date: str, end_date: str): def step_when_get_wells_sorted(context: Context): """Get wells sorted by completion date.""" with session_ctx() as session: - wells = session.query(Thing).filter( - Thing.thing_type == "water well" - ).order_by(Thing.well_completed_on.asc().nullslast()).all() + wells = ( + session.query(Thing) + .filter(Thing.thing_type == "water well") + .order_by(Thing.well_completed_on.asc().nullslast()) + .all() + ) context.sorted_wells = wells @@ -461,6 +501,7 @@ def step_when_retrieve_well(context: Context): # THEN steps + @then('the response should include legacy_date_created as "{expected_date}"') def step_then_legacy_date_created(context: Context, expected_date: str): """Assert legacy_date_created matches.""" @@ -492,8 +533,9 @@ def step_then_time_gap_years(context: Context, years: str): expected_years = float(years) tolerance = 0.5 - assert abs(gap_years - expected_years) < tolerance, \ - f"Expected ~{expected_years} year gap, got {gap_years:.1f} years" + assert ( + abs(gap_years - expected_years) < tolerance + ), f"Expected ~{expected_years} year gap, got {gap_years:.1f} years" @then("each location should have a legacy_date_created field") @@ -524,24 +566,27 @@ def step_then_some_null_inventory(context: Context): def step_then_locations_in_decade(context: Context): """Assert filtered locations are in range.""" for loc in context.filtered_locations: - assert 
2000 <= loc.inventoried_on.year <= 2010, \ - f"Location not in 2000-2010: {loc.inventoried_on}" + assert ( + 2000 <= loc.inventoried_on.year <= 2010 + ), f"Location not in 2000-2010: {loc.inventoried_on}" @then("locations inventoried before {year:Number} should not be included") def step_then_locations_before_excluded(context: Context, year: int): """Assert no locations before year.""" for loc in context.filtered_locations: - assert loc.inventoried_on.year >= year, \ - f"Location from {loc.inventoried_on.year} should not be included" + assert ( + loc.inventoried_on.year >= year + ), f"Location from {loc.inventoried_on.year} should not be included" @then("locations inventoried after {year:Number} should not be included") def step_then_locations_after_excluded(context: Context, year: int): """Assert no locations after year.""" for loc in context.filtered_locations: - assert loc.inventoried_on.year <= year, \ - f"Location from {loc.inventoried_on.year} should not be included" + assert ( + loc.inventoried_on.year <= year + ), f"Location from {loc.inventoried_on.year} should not be included" @then("the response should include exactly {count:Number} locations") @@ -556,8 +601,9 @@ def step_then_all_have_date(context: Context, expected_date: str): """Assert all have same date.""" expected = date.fromisoformat(expected_date) for loc in context.queried_locations: - assert loc.legacy_date_created == expected, \ - f"Location has {loc.legacy_date_created}, expected {expected}" + assert ( + loc.legacy_date_created == expected + ), f"Location has {loc.legacy_date_created}, expected {expected}" @then('the response should include well_completed_on as "{expected_date}"') @@ -610,8 +656,9 @@ def step_then_percentage_populated(context: Context, percentage: int): actual_pct = (populated / total) * 100 tolerance = 10 - assert abs(actual_pct - percentage) < tolerance, \ - f"Expected ~{percentage}%, got {actual_pct:.1f}%" + assert ( + abs(actual_pct - percentage) < tolerance + ), 
f"Expected ~{percentage}%, got {actual_pct:.1f}%" @then("the response should only include wells completed in that decade") @@ -650,11 +697,13 @@ def step_then_nulls_last(context: Context): """Assert nulls at end.""" first_null_idx = next( (i for i, w in enumerate(context.sorted_wells) if w.well_completed_on is None), - len(context.sorted_wells) + len(context.sorted_wells), ) for well in context.sorted_wells[first_null_idx:]: - assert well.well_completed_on is None, "Found non-null after null in sorted list" + assert ( + well.well_completed_on is None + ), "Found non-null after null in sorted list" @then('the well should have well_completed_on as "{expected_date}"') @@ -680,15 +729,21 @@ def step_then_location_has_inventory(context: Context, expected_date: str): assert actual == expected_date, f"Expected {expected_date}, got {actual}" -@then("the temporal sequence should be: well_completed_on → inventoried_on → legacy_date_created") +@then( + "the temporal sequence should be: well_completed_on → inventoried_on → legacy_date_created" +) def step_then_temporal_sequence(context: Context): """Assert temporal order.""" well_completed = context.retrieved_well.well_completed_on inventoried = context.retrieved_location.inventoried_on legacy_created = context.retrieved_location.legacy_date_created - assert well_completed < inventoried, "Well should be completed before site inventoried" - assert inventoried < legacy_created, "Site should be inventoried before DB record created" + assert ( + well_completed < inventoried + ), "Well should be completed before site inventoried" + assert ( + inventoried < legacy_created + ), "Site should be inventoried before DB record created" @then("the timeline should show: {year1:Number} → {year2:Number} → {year3:Number}") @@ -707,8 +762,9 @@ def step_then_percentage_inventory(context: Context, percentage: int): actual_pct = (populated / total) * 100 tolerance = 2 - assert abs(actual_pct - percentage) < tolerance, \ - f"Expected 
~{percentage}%, got {actual_pct:.1f}%" + assert ( + abs(actual_pct - percentage) < tolerance + ), f"Expected ~{percentage}%, got {actual_pct:.1f}%" @then("{percentage:Number}% should have non-null legacy_date_created") @@ -719,8 +775,9 @@ def step_then_percentage_legacy(context: Context, percentage: int): actual_pct = (populated / total) * 100 tolerance = 2 - assert abs(actual_pct - percentage) < tolerance, \ - f"Expected ~{percentage}%, got {actual_pct:.1f}%" + assert ( + abs(actual_pct - percentage) < tolerance + ), f"Expected ~{percentage}%, got {actual_pct:.1f}%" @then("{percentage:Number}% should have non-null well_completed_on") @@ -731,8 +788,9 @@ def step_then_percentage_completion(context: Context, percentage: int): actual_pct = (populated / total) * 100 tolerance = 2 - assert abs(actual_pct - percentage) < tolerance, \ - f"Expected ~{percentage}%, got {actual_pct:.1f}%" + assert ( + abs(actual_pct - percentage) < tolerance + ), f"Expected ~{percentage}%, got {actual_pct:.1f}%" @then("it should have created_at (new system timestamp from migration)") @@ -756,9 +814,9 @@ def step_then_has_inventory_date(context: Context): @then("all three timestamps should be independently queryable") def step_then_all_queryable(context: Context): """Assert all fields are queryable.""" - assert hasattr(context.retrieved_location, 'created_at') - assert hasattr(context.retrieved_location, 'legacy_date_created') - assert hasattr(context.retrieved_location, 'inventoried_on') + assert hasattr(context.retrieved_location, "created_at") + assert hasattr(context.retrieved_location, "legacy_date_created") + assert hasattr(context.retrieved_location, "inventoried_on") @then("created_at should be a recent timestamp") @@ -803,17 +861,17 @@ def step_then_no_error(context: Context): @then("well_completed_on should be null") def step_then_completion_null(context: Context): """Assert well_completed_on is null.""" - if hasattr(context, 'retrieved_thing'): + if hasattr(context, 
"retrieved_thing"): assert context.retrieved_thing.well_completed_on is None - elif hasattr(context, 'retrieved_well'): + elif hasattr(context, "retrieved_well"): assert context.retrieved_well.well_completed_on is None @then("the field should exist in the response schema") def step_then_field_exists_in_schema(context: Context): """Assert field exists in schema.""" - if hasattr(context, 'retrieved_thing'): - assert hasattr(context.retrieved_thing, 'well_completed_on') + if hasattr(context, "retrieved_thing"): + assert hasattr(context.retrieved_thing, "well_completed_on") @then("it should not cause validation errors") From ac04b26af2638ed7a59d06a70942437efaca7537 Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 26 Nov 2025 17:13:06 -0800 Subject: [PATCH 12/66] Add features that describe post-migration behaviors --- ...st-migration-legacy-data-retrieval.feature | 172 ++++ .../steps/post_migration_legacy_data.py | 837 ++++++++++++++++++ 2 files changed, 1009 insertions(+) create mode 100644 tests/features/post-migration-legacy-data-retrieval.feature create mode 100644 tests/features/steps/post_migration_legacy_data.py diff --git a/tests/features/post-migration-legacy-data-retrieval.feature b/tests/features/post-migration-legacy-data-retrieval.feature new file mode 100644 index 000000000..69d2c5506 --- /dev/null +++ b/tests/features/post-migration-legacy-data-retrieval.feature @@ -0,0 +1,172 @@ +Feature: Post-Migration Legacy Data Retrieval + As a data manager + After migrating data from AMPAPI to NMSampleLocations + I want to verify that all legacy temporal information is preserved and queryable + So that no historical context is lost + + Background: + Given a functioning api + And the AMPAPI data has been migrated to the database + + # Location Legacy Date Lookups + + Scenario: Retrieve location with both legacy dates via API + Given a location exists with: + | field | value | + | legacy_date_created | 2014-04-03 | + | inventoried_on | 2002-12-10 | + When I 
retrieve that location via the API + Then the response should include legacy_date_created as "2014-04-03" + And the response should include inventoried_on as "2002-12-10" + And the time gap should be approximately 11.3 years + + Scenario: Retrieve location with large time gap (54 years) + Given a location exists with: + | field | value | + | legacy_date_created | 2008-05-28 | + | inventoried_on | 1954-05-01 | + When I retrieve that location via the API + Then the response should include legacy_date_created as "2008-05-28" + And the response should include inventoried_on as "1954-05-01" + And the time gap should be approximately 54 years + + Scenario: List all locations includes legacy date fields + Given 5 locations exist with various legacy dates + When I GET /location to list all locations + Then each location should have a legacy_date_created field + And each location should have an inventoried_on field + And some locations should have null inventoried_on + + Scenario: Filter locations by inventory date range + Given locations exist with inventoried_on ranging from 1950 to 2024 + When I filter locations where inventoried_on is between "2000-01-01" and "2010-12-31" + Then the response should only include locations inventoried in that decade + And locations inventoried before 2000 should not be included + And locations inventoried after 2010 should not be included + + Scenario: Query location by legacy_date_created + Given 3 locations exist with legacy_date_created "2014-04-03" + And 2 locations exist with legacy_date_created "2017-12-06" + When I query for locations with legacy_date_created "2014-04-03" + Then the response should include exactly 3 locations + And all should have legacy_date_created "2014-04-03" + + # Well Completion Date Lookups + + Scenario: Retrieve well with completion date via API + Given a well exists with well_completed_on "2004-08-08" + When I retrieve that well via the API + Then the response should include well_completed_on as
"2004-08-08" + And the well age should be calculable + + Scenario: Retrieve old well from early 1900s + Given a well exists with well_completed_on "1936-01-01" + When I retrieve that well via the API + Then the response should include well_completed_on as "1936-01-01" + And the well should be over 88 years old + + Scenario: List all wells includes completion date field + Given 10 wells exist with various completion dates + And 3 of those wells have null well_completed_on + When I GET /thing/water-well to list all wells + Then each well should have a well_completed_on field + And 70% of wells should have well_completed_on populated + + Scenario: Filter wells by completion date range + Given wells exist with completion dates from 1936 to 2024 + When I filter wells where well_completed_on is between "2000-01-01" and "2010-12-31" + Then the response should only include wells completed in that decade + And wells from 1936 should not be included + And wells from 2020 should not be included + + Scenario: Sort wells by completion date (oldest first) + Given wells exist with completion dates: 1936, 1965, 2004, 2020 + And some wells have null well_completed_on + When I GET /thing/water-well sorted by well_completed_on ascending + Then the first well should be from 1936 + And the last well with a date should be from 2020 + And wells without completion dates should appear last + + # Combined Queries - Location + Well Legacy Dates + + Scenario: Retrieve well with location showing all legacy dates + Given a well exists with well_completed_on "2004-08-08" + And that well's location has: + | field | value | + | legacy_date_created | 2014-04-03 | + | inventoried_on | 2002-12-10 | + When I retrieve the well via the API + Then the well should have well_completed_on as "2004-08-08" + And the current_location should include legacy_date_created as "2014-04-03" + And the current_location should include inventoried_on as "2002-12-10" + + Scenario: Timeline reconstruction - well completed 
before site inventoried + Given a well exists with well_completed_on "1995-06-15" + And that well's location has: + | field | value | + | inventoried_on | 2003-12-10 | + | legacy_date_created | 2014-04-03 | + When I retrieve the well and its location + Then the temporal sequence should be: well_completed_on → inventoried_on → legacy_date_created + And the timeline should show: 1995 → 2003 → 2014 + + # Data Quality Validation + + Scenario: Verify migration preserved expected percentage of legacy dates + Given 100 locations were migrated + And 9 of them had non-null SiteDate in AMPAPI + When I query the migrated locations + Then 9% should have non-null inventoried_on + And 100% should have non-null legacy_date_created + + Scenario: Verify well completion date coverage matches expectation + Given 100 wells were migrated + And 30 of them had non-null CompletionDate in AMPAPI + When I query the migrated wells + Then 30% should have non-null well_completed_on + + # Audit Trail Verification + + Scenario: Legacy dates preserved alongside audit timestamps + Given a location was migrated with legacy dates + When I retrieve that location + Then it should have created_at (new system timestamp from migration) + And it should have legacy_date_created (original AMPAPI DateCreated) + And it should have inventoried_on (original AMPAPI SiteDate) + And all three timestamps should be independently queryable + And created_at should be a recent timestamp + And legacy_date_created should be an older date + + # Edge Cases + + Scenario: Location where SiteDate is later than DateCreated (data anomaly) + Given a location exists with: + | field | value | + | legacy_date_created | 2010-01-15 | + | inventoried_on | 2015-06-20 | + When I retrieve that location + Then legacy_date_created should be "2010-01-15" + And inventoried_on should be "2015-06-20" + And the system should accept this without error + + Scenario: Spring does not use well_completed_on field + Given a thing of type "spring" 
exists + When I retrieve that spring + Then well_completed_on should be null + And the field should exist in the response schema + And it should not cause validation errors + + Scenario: Location with only legacy_date_created (no inventoried_on) + Given a location exists with: + | field | value | + | legacy_date_created | 2014-10-17 | + | inventoried_on | null | + When I retrieve that location + Then legacy_date_created should be "2014-10-17" + And inventoried_on should be null + + Scenario: Well without completion date + Given a well exists with well_completed_on null + When I retrieve that well + Then well_completed_on should be null + And the well should still be valid diff --git a/tests/features/steps/post_migration_legacy_data.py b/tests/features/steps/post_migration_legacy_data.py new file mode 100644 index 000000000..dca15d638 --- /dev/null +++ b/tests/features/steps/post_migration_legacy_data.py @@ -0,0 +1,837 @@ +# =============================================================================== +# Copyright 2025 ross +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== +from datetime import date, datetime +from behave import given, when, then, register_type +from behave.runner import Context +import parse + +from db import Location, Thing, LocationThingAssociation +from db.engine import session_ctx + + +# Custom type parsers +@parse.with_pattern(r"\d+") +def parse_number(text): + return int(text) + +register_type(Number=parse_number) + + +def create_test_location(legacy_date_created=None, inventoried_on=None): + """Helper to create a test location with legacy dates.""" + with session_ctx() as session: + location = Location( + point="POINT(-106.607784 35.118924)", + elevation=1558.8, + release_status="public", + legacy_date_created=legacy_date_created, + inventoried_on=inventoried_on, + ) + session.add(location) + session.commit() + session.refresh(location) + return location + + +def create_test_well(well_completed_on=None, thing_type="water well"): + """Helper to create a test well with completion date.""" + with session_ctx() as session: + # Create location + location = Location( + point="POINT(-106.607784 35.118924)", + elevation=1558.8, + release_status="public", + ) + session.add(location) + session.commit() + + # Create thing + thing = Thing( + name=f"Test-{thing_type}-{datetime.now().timestamp()}", + first_visit_date="2023-03-03", + thing_type=thing_type, + release_status="public", + well_depth=100.0 if thing_type == "water well" else None, + hole_depth=110.0 if thing_type == "water well" else None, + well_completed_on=well_completed_on, + ) + session.add(thing) + session.commit() + + # Associate + assoc = LocationThingAssociation(location=location, thing=thing) + assoc.effective_start = "2000-01-01T00:00:00Z" + session.add(assoc) + session.commit() + + session.refresh(thing) + session.refresh(location) + return thing, location + + +@given("the AMPAPI data has been migrated to the database") +def step_given_data_migrated(context: Context): + 
"""Assumption that migration has occurred.""" + context.migrated = True + + +@given("a location exists with") +def step_given_location_with_table(context: Context): + """Create location with fields from table.""" + data = {row['field']: row['value'] for row in context.table} + + legacy_date_created = date.fromisoformat(data['legacy_date_created']) if data.get('legacy_date_created') and data['legacy_date_created'] != 'null' else None + inventoried_on = date.fromisoformat(data['inventoried_on']) if data.get('inventoried_on') and data['inventoried_on'] != 'null' else None + + location = create_test_location( + legacy_date_created=legacy_date_created, + inventoried_on=inventoried_on + ) + + context.test_location = location + context.test_location_id = location.id + + +@given("{count:Number} locations exist with various legacy dates") +def step_given_multiple_locations(context: Context, count: int): + """Create multiple locations with various legacy dates.""" + context.test_locations = [] + + test_data = [ + ("2014-04-03", "2002-12-10"), + ("2014-04-03", "2003-01-07"), + ("2017-12-06", "2003-12-11"), + ("2008-05-28", "1954-05-01"), + ("2020-01-15", None), + ] + + for i in range(min(count, len(test_data))): + legacy_date, inventory_date = test_data[i] + location = create_test_location( + legacy_date_created=date.fromisoformat(legacy_date), + inventoried_on=date.fromisoformat(inventory_date) if inventory_date else None + ) + context.test_locations.append(location) + + +@given("locations exist with inventoried_on ranging from {start_year:Number} to {end_year:Number}") +def step_given_locations_date_range(context: Context, start_year: int, end_year: int): + """Create locations with inventoried_on across a date range.""" + context.test_locations = [] + + years = [1954, 2002, 2003, 2010, 2015, 2020, 2024] + for year in years: + location = create_test_location( + legacy_date_created=date(year + 5, 1, 1), # Always 5 years after inventory + inventoried_on=date(year, 6, 15) + ) + 
context.test_locations.append(location) + + +@given('{count:Number} locations exist with legacy_date_created "{target_date}"') +def step_given_locations_with_specific_date(context: Context, count: int, target_date: str): + """Create locations with specific legacy_date_created.""" + if not hasattr(context, 'test_locations'): + context.test_locations = [] + + target = date.fromisoformat(target_date) + + for i in range(count): + location = create_test_location( + legacy_date_created=target, + inventoried_on=date(2000 + i, 1, 1) # Vary the inventory dates + ) + context.test_locations.append(location) + + +@given('a well exists with well_completed_on "{completion_date}"') +def step_given_well_with_completion(context: Context, completion_date: str): + """Create well with completion date.""" + completed_on = date.fromisoformat(completion_date) if completion_date != 'null' else None + + thing, location = create_test_well(well_completed_on=completed_on) + + context.test_well = thing + context.test_well_id = thing.id + context.test_well_location = location + + +@given("{count:Number} wells exist with various completion dates") +def step_given_multiple_wells(context: Context, count: int): + """Create multiple wells with various completion dates.""" + context.test_wells = [] + + completion_dates = [ + "1936-01-01", + "1965-06-15", + "2004-08-08", + "2020-05-15", + None, # No completion date + None, + None, + ] + + for i in range(min(count, len(completion_dates))): + completed_on = date.fromisoformat(completion_dates[i]) if completion_dates[i] else None + thing, location = create_test_well(well_completed_on=completed_on) + context.test_wells.append(thing) + + +@given("{null_count:Number} of those wells have null well_completed_on") +def step_given_wells_with_null_completion(context: Context, null_count: int): + """Verify expected number of nulls (declarative - already created).""" + # Wells were created in previous step with nulls + pass + + +@given("wells exist with completion 
dates from {start_year:Number} to {end_year:Number}") +def step_given_wells_date_range(context: Context, start_year: int, end_year: int): + """Create wells with completion dates across range.""" + context.test_wells = [] + + years = [1936, 1965, 2004, 2010, 2020, 2024] + for year in years: + thing, location = create_test_well(well_completed_on=date(year, 6, 15)) + context.test_wells.append(thing) + + +@given("wells exist with completion dates: {years}") +def step_given_wells_specific_years(context: Context, years: str): + """Create wells with specific completion years.""" + context.test_wells = [] + + year_list = [int(y.strip()) for y in years.split(',')] + + for year in year_list: + thing, location = create_test_well(well_completed_on=date(year, 6, 15)) + context.test_wells.append(thing) + + +@given("some wells have null well_completed_on") +def step_given_some_wells_null(context: Context): + """Add wells without completion dates.""" + if not hasattr(context, 'test_wells'): + context.test_wells = [] + + for i in range(2): + thing, location = create_test_well(well_completed_on=None) + context.test_wells.append(thing) + + +@given("that well's location has") +def step_given_well_location_has_table(context: Context): + """Set legacy dates on the well's location.""" + data = {row['field']: row['value'] for row in context.table} + + legacy_date_created = date.fromisoformat(data.get('legacy_date_created')) if data.get('legacy_date_created') else None + inventoried_on = date.fromisoformat(data.get('inventoried_on')) if data.get('inventoried_on') else None + + with session_ctx() as session: + location = session.get(Location, context.test_well_location.id) + location.legacy_date_created = legacy_date_created + location.inventoried_on = inventoried_on + session.commit() + session.refresh(location) + context.test_well_location = location + + +@given("{count:Number} locations were migrated") +def step_given_count_locations_migrated(context: Context, count: int): + """Create 
specified number of test locations.""" + context.test_locations = [] + + for i in range(count): + # 9% have inventoried_on + has_inventory = (i < count * 0.09) + + location = create_test_location( + legacy_date_created=date(2014, 1, i % 28 + 1), + inventoried_on=date(2003, 1, i % 28 + 1) if has_inventory else None + ) + context.test_locations.append(location) + + +@given("{count:Number} of them had non-null SiteDate in AMPAPI") +def step_given_sitedate_count(context: Context, count: int): + """Declarative - data created in previous step.""" + pass + + +@given("{count:Number} wells were migrated") +def step_given_count_wells_migrated(context: Context, count: int): + """Create specified number of test wells.""" + context.test_wells = [] + + for i in range(count): + # 30% have completion dates + has_completion = (i < count * 0.30) + + thing, location = create_test_well( + well_completed_on=date(2000 + (i % 24), 1, 1) if has_completion else None + ) + context.test_wells.append(thing) + + +@given("{count:Number} of them had non-null CompletionDate in AMPAPI") +def step_given_completion_count(context: Context, count: int): + """Declarative - data created in previous step.""" + pass + + +@given("a location was migrated with legacy dates") +def step_given_location_migrated_with_dates(context: Context): + """Create location with both legacy dates.""" + location = create_test_location( + legacy_date_created=date(2014, 4, 3), + inventoried_on=date(2002, 12, 10) + ) + context.test_location = location + + +@given('a thing of type "{thing_type}" exists') +def step_given_thing_of_type(context: Context, thing_type: str): + """Create a thing of specified type.""" + thing, location = create_test_well(well_completed_on=None, thing_type=thing_type) + context.test_thing = thing + context.test_thing_id = thing.id + + +@given("a well exists with well_completed_on null") +def step_given_well_null_completion(context: Context): + """Create well without completion date.""" + thing, location 
= create_test_well(well_completed_on=None) + context.test_well = thing + context.test_well_id = thing.id + + +# WHEN steps + +@when("I retrieve that location via the API") +def step_when_retrieve_location_api(context: Context): + """Retrieve location via GET API.""" + response = context.client.get(f"/location/{context.test_location_id}") + assert response.status_code == 200 + context.location_response = response.json() + + +@when("I GET /location to list all locations") +def step_when_get_all_locations(context: Context): + """Get all locations.""" + response = context.client.get("/location") + assert response.status_code == 200 + context.locations_response = response.json() + + +@when('I filter locations where inventoried_on is between "{start_date}" and "{end_date}"') +def step_when_filter_locations(context: Context, start_date: str, end_date: str): + """Filter locations by date range.""" + # Since API may not support this yet, query database directly + with session_ctx() as session: + start = date.fromisoformat(start_date) + end = date.fromisoformat(end_date) + + locations = session.query(Location).filter( + Location.inventoried_on >= start, + Location.inventoried_on <= end + ).all() + + context.filtered_locations = locations + + +@when('I query for locations with legacy_date_created "{target_date}"') +def step_when_query_by_legacy_date(context: Context, target_date: str): + """Query locations by legacy_date_created.""" + with session_ctx() as session: + target = date.fromisoformat(target_date) + locations = session.query(Location).filter( + Location.legacy_date_created == target + ).all() + context.queried_locations = locations + + +@when("I retrieve that well via the API") +def step_when_retrieve_well_api(context: Context): + """Retrieve well via GET API.""" + response = context.client.get(f"/thing/water-well/{context.test_well_id}") + assert response.status_code == 200 + context.well_response = response.json() + + +@when("I GET /thing/water-well to list all 
wells") +def step_when_get_all_wells(context: Context): + """Get all wells.""" + response = context.client.get("/thing/water-well") + assert response.status_code == 200 + context.wells_response = response.json() + + +@when('I filter wells where well_completed_on is between "{start_date}" and "{end_date}"') +def step_when_filter_wells(context: Context, start_date: str, end_date: str): + """Filter wells by completion date range.""" + with session_ctx() as session: + start = date.fromisoformat(start_date) + end = date.fromisoformat(end_date) + + wells = session.query(Thing).filter( + Thing.thing_type == "water well", + Thing.well_completed_on >= start, + Thing.well_completed_on <= end + ).all() + + context.filtered_wells = wells + + +@when("I GET /thing/water-well sorted by well_completed_on ascending") +def step_when_get_wells_sorted(context: Context): + """Get wells sorted by completion date.""" + with session_ctx() as session: + wells = session.query(Thing).filter( + Thing.thing_type == "water well" + ).order_by(Thing.well_completed_on.asc().nullslast()).all() + + context.sorted_wells = wells + + +@when("I retrieve the well and its location") +def step_when_retrieve_well_and_location(context: Context): + """Retrieve well with location.""" + with session_ctx() as session: + well = session.get(Thing, context.test_well.id) + location = session.get(Location, context.test_well_location.id) + + context.retrieved_well = well + context.retrieved_location = location + + +@when("I query the migrated locations") +def step_when_query_migrated_locations(context: Context): + """Query all test locations.""" + with session_ctx() as session: + # Query only our test locations + location_ids = [loc.id for loc in context.test_locations] + locations = session.query(Location).filter(Location.id.in_(location_ids)).all() + context.queried_locations = locations + + +@when("I query the migrated wells") +def step_when_query_migrated_wells(context: Context): + """Query all test wells.""" + 
with session_ctx() as session: + well_ids = [well.id for well in context.test_wells] + wells = session.query(Thing).filter(Thing.id.in_(well_ids)).all() + context.queried_wells = wells + + +@when("I retrieve that location") +def step_when_retrieve_location(context: Context): + """Retrieve location by ID.""" + with session_ctx() as session: + location = session.get(Location, context.test_location.id) + context.retrieved_location = location + + +@when("I retrieve that spring") +def step_when_retrieve_spring(context: Context): + """Retrieve spring/thing by ID.""" + with session_ctx() as session: + thing = session.get(Thing, context.test_thing.id) + context.retrieved_thing = thing + + +@when("I retrieve that well") +def step_when_retrieve_well(context: Context): + """Retrieve well by ID.""" + with session_ctx() as session: + well = session.get(Thing, context.test_well.id) + context.retrieved_well = well + + +# THEN steps + +@then('the response should include legacy_date_created as "{expected_date}"') +def step_then_legacy_date_created(context: Context, expected_date: str): + """Assert legacy_date_created matches.""" + actual = context.location_response.get("legacy_date_created") + assert actual == expected_date, f"Expected {expected_date}, got {actual}" + + +@then('the response should include inventoried_on as "{expected_date}"') +def step_then_inventoried_on(context: Context, expected_date: str): + """Assert inventoried_on matches.""" + actual = context.location_response.get("inventoried_on") + assert actual == expected_date, f"Expected {expected_date}, got {actual}" + + +@then("the time gap should be approximately {years} years") +def step_then_time_gap_years(context: Context, years: str): + """Assert approximate year gap.""" + legacy_str = context.location_response.get("legacy_date_created") + inventory_str = context.location_response.get("inventoried_on") + + if not legacy_str or not inventory_str: + raise AssertionError("Missing date fields for gap calculation") + 
+ legacy_date = date.fromisoformat(legacy_str) + inventory_date = date.fromisoformat(inventory_str) + + gap_days = (legacy_date - inventory_date).days + gap_years = gap_days / 365.25 + + expected_years = float(years) + tolerance = 0.5 + assert abs(gap_years - expected_years) < tolerance, \ + f"Expected ~{expected_years} year gap, got {gap_years:.1f} years" + + +@then("each location should have a legacy_date_created field") +def step_then_all_have_legacy_field(context: Context): + """Assert all locations have the field.""" + items = context.locations_response.get("items", []) + for item in items: + assert "legacy_date_created" in item, f"Location missing legacy_date_created" + + +@then("each location should have an inventoried_on field") +def step_then_all_have_inventory_field(context: Context): + """Assert all locations have the field.""" + items = context.locations_response.get("items", []) + for item in items: + assert "inventoried_on" in item, f"Location missing inventoried_on" + + +@then("some locations should have null inventoried_on") +def step_then_some_null_inventory(context: Context): + """Assert some locations have null.""" + items = context.locations_response.get("items", []) + null_count = sum(1 for item in items if item.get("inventoried_on") is None) + assert null_count > 0, "Expected at least one location with null inventoried_on" + + +@then("the response should only include locations inventoried in that decade") +def step_then_locations_in_decade(context: Context): + """Assert filtered locations are in range.""" + for loc in context.filtered_locations: + assert 2000 <= loc.inventoried_on.year <= 2010, \ + f"Location not in 2000-2010: {loc.inventoried_on}" + + +@then("locations inventoried before {year:Number} should not be included") +def step_then_locations_before_excluded(context: Context, year: int): + """Assert no locations before year.""" + for loc in context.filtered_locations: + assert loc.inventoried_on.year >= year, \ + f"Location from 
{loc.inventoried_on.year} should not be included" + + +@then("locations inventoried after {year:Number} should not be included") +def step_then_locations_after_excluded(context: Context, year: int): + """Assert no locations after year.""" + for loc in context.filtered_locations: + assert loc.inventoried_on.year <= year, \ + f"Location from {loc.inventoried_on.year} should not be included" + + +@then("the response should include exactly {count:Number} locations") +def step_then_exact_count_locations(context: Context, count: int): + """Assert exact count.""" + actual = len(context.queried_locations) + assert actual == count, f"Expected {count} locations, got {actual}" + + +@then('all should have legacy_date_created "{expected_date}"') +def step_then_all_have_date(context: Context, expected_date: str): + """Assert all have same date.""" + expected = date.fromisoformat(expected_date) + for loc in context.queried_locations: + assert loc.legacy_date_created == expected, \ + f"Location has {loc.legacy_date_created}, expected {expected}" + + +@then('the response should include well_completed_on as "{expected_date}"') +def step_then_well_completed_on(context: Context, expected_date: str): + """Assert well_completed_on matches.""" + actual = context.well_response.get("well_completed_on") + assert actual == expected_date, f"Expected {expected_date}, got {actual}" + + +@then("the well age should be calculable") +def step_then_age_calculable(context: Context): + """Assert age can be calculated.""" + completion_str = context.well_response.get("well_completed_on") + assert completion_str is not None, "Cannot calculate age without completion date" + + completed = date.fromisoformat(completion_str) + today = date.today() + age_years = (today - completed).days / 365.25 + assert age_years >= 0, "Age cannot be negative" + + +@then("the well should be over {min_age:Number} years old") +def step_then_well_over_age(context: Context, min_age: int): + """Assert well age exceeds minimum.""" 
+ completion_str = context.well_response.get("well_completed_on") + completed = date.fromisoformat(completion_str) + today = date.today() + age_years = (today - completed).days / 365.25 + + assert age_years >= min_age, f"Expected over {min_age} years, got {age_years:.1f}" + + +@then("each well should have a well_completed_on field") +def step_then_all_wells_have_field(context: Context): + """Assert all wells have the field.""" + items = context.wells_response.get("items", []) + for item in items: + assert "well_completed_on" in item, f"Well missing well_completed_on" + + +@then("{percentage:Number}% of wells should have well_completed_on populated") +def step_then_percentage_populated(context: Context, percentage: int): + """Assert approximate percentage.""" + items = context.wells_response.get("items", []) + total = len(items) + if total == 0: + return + + populated = sum(1 for item in items if item.get("well_completed_on") is not None) + actual_pct = (populated / total) * 100 + + tolerance = 10 + assert abs(actual_pct - percentage) < tolerance, \ + f"Expected ~{percentage}%, got {actual_pct:.1f}%" + + +@then("the response should only include wells completed in that decade") +def step_then_wells_in_decade(context: Context): + """Assert filtered wells in range.""" + for well in context.filtered_wells: + assert 2000 <= well.well_completed_on.year <= 2010 + + +@then("wells from {year:Number} should not be included") +def step_then_wells_year_excluded(context: Context, year: int): + """Assert wells from year excluded.""" + for well in context.filtered_wells: + assert well.well_completed_on.year != year + + +@then("the first well should be from {year:Number}") +def step_then_first_well_year(context: Context, year: int): + """Assert first well year.""" + if context.sorted_wells and context.sorted_wells[0].well_completed_on: + actual_year = context.sorted_wells[0].well_completed_on.year + assert actual_year == year, f"Expected {year}, got {actual_year}" + + +@then("the 
last well with a date should be from {year:Number}") +def step_then_last_well_year(context: Context, year: int): + """Assert last non-null well year.""" + non_null = [w for w in context.sorted_wells if w.well_completed_on] + if non_null: + actual_year = non_null[-1].well_completed_on.year + assert actual_year == year, f"Expected {year}, got {actual_year}" + + +@then("wells without completion dates should appear last") +def step_then_nulls_last(context: Context): + """Assert nulls at end.""" + first_null_idx = next( + (i for i, w in enumerate(context.sorted_wells) if w.well_completed_on is None), + len(context.sorted_wells) + ) + + for well in context.sorted_wells[first_null_idx:]: + assert well.well_completed_on is None, "Found non-null after null in sorted list" + + +@then('the well should have well_completed_on as "{expected_date}"') +def step_then_well_has_completion(context: Context, expected_date: str): + """Assert well has completion date.""" + actual = context.well_response.get("well_completed_on") + assert actual == expected_date, f"Expected {expected_date}, got {actual}" + + +@then('the current_location should include legacy_date_created as "{expected_date}"') +def step_then_location_has_legacy(context: Context, expected_date: str): + """Assert location has legacy_date_created.""" + current_location = context.well_response.get("current_location", {}) + actual = current_location.get("legacy_date_created") + assert actual == expected_date, f"Expected {expected_date}, got {actual}" + + +@then('the current_location should include inventoried_on as "{expected_date}"') +def step_then_location_has_inventory(context: Context, expected_date: str): + """Assert location has inventoried_on.""" + current_location = context.well_response.get("current_location", {}) + actual = current_location.get("inventoried_on") + assert actual == expected_date, f"Expected {expected_date}, got {actual}" + + +@then("the temporal sequence should be: well_completed_on → inventoried_on → 
legacy_date_created") +def step_then_temporal_sequence(context: Context): + """Assert temporal order.""" + well_completed = context.retrieved_well.well_completed_on + inventoried = context.retrieved_location.inventoried_on + legacy_created = context.retrieved_location.legacy_date_created + + assert well_completed < inventoried, "Well should be completed before site inventoried" + assert inventoried < legacy_created, "Site should be inventoried before DB record created" + + +@then("the timeline should show: {year1:Number} → {year2:Number} → {year3:Number}") +def step_then_timeline_years(context: Context, year1: int, year2: int, year3: int): + """Assert specific years in sequence.""" + assert context.retrieved_well.well_completed_on.year == year1 + assert context.retrieved_location.inventoried_on.year == year2 + assert context.retrieved_location.legacy_date_created.year == year3 + + +@then("{percentage:Number}% should have non-null inventoried_on") +def step_then_percentage_inventory(context: Context, percentage: int): + """Assert percentage with inventoried_on.""" + total = len(context.queried_locations) + populated = sum(1 for loc in context.queried_locations if loc.inventoried_on) + actual_pct = (populated / total) * 100 + + tolerance = 2 + assert abs(actual_pct - percentage) < tolerance, \ + f"Expected ~{percentage}%, got {actual_pct:.1f}%" + + +@then("{percentage:Number}% should have non-null legacy_date_created") +def step_then_percentage_legacy(context: Context, percentage: int): + """Assert percentage with legacy_date_created.""" + total = len(context.queried_locations) + populated = sum(1 for loc in context.queried_locations if loc.legacy_date_created) + actual_pct = (populated / total) * 100 + + tolerance = 2 + assert abs(actual_pct - percentage) < tolerance, \ + f"Expected ~{percentage}%, got {actual_pct:.1f}%" + + +@then("{percentage:Number}% should have non-null well_completed_on") +def step_then_percentage_completion(context: Context, percentage: int): 
+ """Assert percentage with well_completed_on.""" + total = len(context.queried_wells) + populated = sum(1 for well in context.queried_wells if well.well_completed_on) + actual_pct = (populated / total) * 100 + + tolerance = 2 + assert abs(actual_pct - percentage) < tolerance, \ + f"Expected ~{percentage}%, got {actual_pct:.1f}%" + + +@then("it should have created_at (new system timestamp from migration)") +def step_then_has_created_at(context: Context): + """Assert created_at exists.""" + assert context.retrieved_location.created_at is not None + + +@then("it should have legacy_date_created (original AMPAPI DateCreated)") +def step_then_has_legacy_date(context: Context): + """Assert legacy_date_created exists.""" + assert context.retrieved_location.legacy_date_created is not None + + +@then("it should have inventoried_on (original AMPAPI SiteDate)") +def step_then_has_inventory_date(context: Context): + """Assert inventoried_on exists.""" + assert context.retrieved_location.inventoried_on is not None + + +@then("all three timestamps should be independently queryable") +def step_then_all_queryable(context: Context): + """Assert all fields are queryable.""" + assert hasattr(context.retrieved_location, 'created_at') + assert hasattr(context.retrieved_location, 'legacy_date_created') + assert hasattr(context.retrieved_location, 'inventoried_on') + + +@then("created_at should be a recent timestamp") +def step_then_created_at_recent(context: Context): + """Assert created_at is recent.""" + created_at = context.retrieved_location.created_at.replace(tzinfo=None) + now = datetime.utcnow() + diff_seconds = abs((now - created_at).total_seconds()) + assert diff_seconds < 3600, "created_at should be within last hour" + + +@then("legacy_date_created should be an older date") +def step_then_legacy_date_older(context: Context): + """Assert legacy_date_created is old.""" + legacy_date = context.retrieved_location.legacy_date_created + assert legacy_date.year < 2024, 
"legacy_date_created should be from the past" + + +@then('legacy_date_created should be "{expected_date}"') +def step_then_legacy_is(context: Context, expected_date: str): + """Assert legacy_date_created value.""" + actual = context.retrieved_location.legacy_date_created + expected = date.fromisoformat(expected_date) + assert actual == expected, f"Expected {expected}, got {actual}" + + +@then('inventoried_on should be "{expected_date}"') +def step_then_inventory_is(context: Context, expected_date: str): + """Assert inventoried_on value.""" + actual = context.retrieved_location.inventoried_on + expected = date.fromisoformat(expected_date) + assert actual == expected, f"Expected {expected}, got {actual}" + + +@then("the system should accept this without error") +def step_then_no_error(context: Context): + """Assert no errors.""" + # If we got here, no errors + pass + + +@then("well_completed_on should be null") +def step_then_completion_null(context: Context): + """Assert well_completed_on is null.""" + if hasattr(context, 'retrieved_thing'): + assert context.retrieved_thing.well_completed_on is None + elif hasattr(context, 'retrieved_well'): + assert context.retrieved_well.well_completed_on is None + + +@then("the field should exist in the response schema") +def step_then_field_exists_in_schema(context: Context): + """Assert field exists in schema.""" + if hasattr(context, 'retrieved_thing'): + assert hasattr(context.retrieved_thing, 'well_completed_on') + + +@then("it should not cause validation errors") +def step_then_no_validation_errors(context: Context): + """Assert no validation errors.""" + pass + + +@then("inventoried_on should be null") +def step_then_inventory_null(context: Context): + """Assert inventoried_on is null.""" + assert context.retrieved_location.inventoried_on is None + + +@then("the well should still be valid") +def step_then_well_valid(context: Context): + """Assert well is valid.""" + assert context.retrieved_well.id is not None + + +# 
============= EOF ============================================= From 6169c3eeab3487d650894a73c447af070ae32b94 Mon Sep 17 00:00:00 2001 From: jakeross Date: Wed, 26 Nov 2025 18:28:04 -0700 Subject: [PATCH 13/66] refactor: enhance water level transfer functions by introducing source table variable --- transfers/waterlevels_transducer_transfer.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/transfers/waterlevels_transducer_transfer.py b/transfers/waterlevels_transducer_transfer.py index f1ef30cd1..e4ce178c0 100644 --- a/transfers/waterlevels_transducer_transfer.py +++ b/transfers/waterlevels_transducer_transfer.py @@ -23,15 +23,19 @@ def transfer_water_levels_acoustic(session): - wd = read_csv("WaterLevelsContinuous_Acoustic") + source_table = "WaterLevelsContinuous_Acoustic" + wd = read_csv(source_table) return _transfer_water_levels_continuous( - session, wd, "PublicRelease", "Acoustic Sounder" + session, source_table, wd, "PublicRelease", "Acoustic Sounder" ) def transfer_water_levels_pressure(session): - wd = read_csv("WaterLevelsContinuous_Pressure") - return _transfer_water_levels_continuous(session, wd, "QCed", "Pressure Transducer") + source_table = "WaterLevelsContinuous_Pressure" + wd = read_csv(source_table) + return _transfer_water_levels_continuous( + session, source_table, wd, "QCed", "Pressure Transducer" + ) def _find_deployment(ts, deployments): @@ -45,7 +49,9 @@ def _find_deployment(ts, deployments): return None -def _transfer_water_levels_continuous(session, input_df, partition_field, sensor_type): +def _transfer_water_levels_continuous( + session, source_table, input_df, partition_field, sensor_type +): from schemas.transducer import CreateTransducerObservation groundwater_parameter_id = ( @@ -173,6 +179,7 @@ def _transfer_water_levels_continuous(session, input_df, partition_field, sensor for pointid, (min_date, max_date) in nodeployments.items(): errors.append( { + "table": source_table, "pointid": pointid, 
"error": f"no deployment between {min_date} and {max_date}", } From 952c5db040e783d2386c62a1b46410d225d0b8df Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 26 Nov 2025 17:31:38 -0800 Subject: [PATCH 14/66] Rename `inventoried_on` to `legacy_start_date` since it won't continue on --- ...st-migration-legacy-data-retrieval.feature | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/tests/features/post-migration-legacy-data-retrieval.feature b/tests/features/post-migration-legacy-data-retrieval.feature index 69d2c5506..fa4663e1b 100644 --- a/tests/features/post-migration-legacy-data-retrieval.feature +++ b/tests/features/post-migration-legacy-data-retrieval.feature @@ -14,35 +14,35 @@ Feature: Post-Migration Legacy Data Retrieval Given a location exists with: | field | value | | legacy_date_created | 2014-04-03 | - | inventoried_on | 2002-12-10 | + | legacy_site_date | 2002-12-10 | When I retrieve that location via the API Then the response should include legacy_date_created as "2014-04-03" - And the response should include inventoried_on as "2002-12-10" + And the response should include legacy_site_date as "2002-12-10" And the time gap should be approximately 11.3 years Scenario: Retrieve location with large time gap (54 years) Given a location exists with: | field | value | | legacy_date_created | 2008-05-28 | - | inventoried_on | 1954-05-01 | + | legacy_site_date | 1954-05-01 | When I retrieve that location via the API Then the response should include legacy_date_created as "2008-05-28" - And the response should include inventoried_on as "2002-12-10" + And the response should include legacy_site_date as "1954-05-01" And the time gap should be approximately 54 years Scenario: List all locations includes legacy date fields Given 5 locations exist with various legacy dates When I GET /location to list all locations Then each location should have a legacy_date_created field - And each location should have an inventoried_on 
field - And some locations should have null inventoried_on + And each location should have a legacy_site_date field + And some locations should have null legacy_site_date - Scenario: Filter locations by inventory date range - Given locations exist with inventoried_on ranging from 1950 to 2024 - When I filter locations where inventoried_on is between "2000-01-01" and "2010-12-31" - Then the response should only include locations inventoried in that decade - And locations inventoried before 2000 should not be included - And locations inventoried after 2010 should not be included + Scenario: Filter locations by legacy site date range + Given locations exist with legacy_site_date ranging from 1950 to 2024 + When I filter locations where legacy_site_date is between "2000-01-01" and "2010-12-31" + Then the response should only include locations with legacy_site_date in that decade + And locations with legacy_site_date before 2000 should not be included + And locations with legacy_site_date after 2010 should not be included Scenario: Query location by legacy_date_created Given 3 locations exist with legacy_date_created "2014-04-03" @@ -94,20 +94,20 @@ Feature: Post-Migration Legacy Data Retrieval And that well's location has: | field | value | | legacy_date_created | 2014-04-03 | - | inventoried_on | 2002-12-10 | + | legacy_site_date | 2002-12-10 | When I retrieve the well via the API Then the well should have well_completed_on as "2004-08-08" And the current_location should include legacy_date_created as "2014-04-03" - And the current_location should include inventoried_on as "2002-12-10" + And the current_location should include legacy_site_date as "2002-12-10" Scenario: Timeline reconstruction - well completed before site inventoried Given a well exists with well_completed_on "1995-06-15" And that well's location has: | field | value | - | inventoried_on | 2003-12-10 | + | legacy_site_date | 2003-12-10 | | legacy_date_created | 2014-04-03 | When I retrieve the well and 
its location - Then the temporal sequence should be: well_completed_on → inventoried_on → legacy_date_created + Then the temporal sequence should be: well_completed_on → legacy_site_date → legacy_date_created And the timeline should show: 1995 → 2003 → 2014 # Data Quality Validation @@ -116,7 +116,7 @@ Feature: Post-Migration Legacy Data Retrieval Given 100 locations were migrated And 9 of them had non-null SiteDate in AMPAPI When I query the migrated locations - Then 9% should have non-null inventoried_on + Then 9% should have non-null legacy_site_date And 100% should have non-null legacy_date_created Scenario: Verify well completion date coverage matches expectation @@ -132,7 +132,7 @@ Feature: Post-Migration Legacy Data Retrieval When I retrieve that location Then it should have created_at (new system timestamp from migration) And it should have legacy_date_created (original AMPAPI DateCreated) - And it should have inventoried_on (original AMPAPI SiteDate) + And it should have legacy_site_date (original AMPAPI SiteDate) And all three timestamps should be independently queryable And created_at should be a recent timestamp And legacy_date_created should be an older date @@ -143,10 +143,10 @@ Feature: Post-Migration Legacy Data Retrieval Given a location exists with: | field | value | | legacy_date_created | 2010-01-15 | - | inventoried_on | 2015-06-20 | + | legacy_site_date | 2015-06-20 | When I retrieve that location Then legacy_date_created should be "2010-01-15" - And inventoried_on should be "2015-06-20" + And legacy_site_date should be "2015-06-20" And the system should accept this without error Scenario: Spring does not use well_completed_on field @@ -156,14 +156,14 @@ Feature: Post-Migration Legacy Data Retrieval And the field should exist in the response schema And it should not cause validation errors - Scenario: Location with only legacy_date_created (no inventoried_on) + Scenario: Location with only legacy_date_created (no legacy_site_date) Given a 
location exists with: | field | value | | legacy_date_created | 2014-10-17 | - | inventoried_on | null | + | legacy_site_date | null | When I retrieve that location Then legacy_date_created should be "2014-10-17" - And inventoried_on should be null + And legacy_site_date should be null Scenario: Well without completion date Given a well exists with well_completed_on null From dbfc8ef6dfadc46ead68cdb7aad121e01f975dbe Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 26 Nov 2025 17:37:47 -0800 Subject: [PATCH 15/66] Add new fields to unit tests --- tests/test_location.py | 79 ++++++++++++++++++++++ tests/test_thing.py | 147 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 226 insertions(+) diff --git a/tests/test_location.py b/tests/test_location.py index 4b6ec6faa..b86211a58 100644 --- a/tests/test_location.py +++ b/tests/test_location.py @@ -235,4 +235,83 @@ def test_delete_location_404_not_found(second_location): assert data["detail"] == f"Location with ID {bad_location_id} not found." 
+# ============= Legacy date field tests ======================================= + + +def test_new_location_has_null_legacy_fields(): + """Test that newly created locations have null legacy date fields (legacy fields are migration-only)""" + payload = { + "point": "POINT (-106.607784 35.118924)", + "elevation": 1558.8, + "release_status": "draft", + } + response = client.post("/location", json=payload) + + assert response.status_code == 201 + data = response.json() + assert "id" in data + # Legacy fields should be present in response but null (not set during creation) + assert "legacy_date_created" in data + assert "legacy_site_date" in data + assert data["legacy_date_created"] is None + assert data["legacy_site_date"] is None + + # cleanup after test + cleanup_post_test(Location, data["id"]) + + +def test_legacy_fields_present_in_location_response(): + """Test that legacy fields are included in location GET response""" + # Create a new location (without legacy fields) + payload = { + "point": "POINT (-106.607784 35.118924)", + "elevation": 1558.8, + "release_status": "draft", + } + create_response = client.post("/location", json=payload) + assert create_response.status_code == 201 + location_id = create_response.json()["id"] + + # Retrieve the location and verify legacy fields are in the schema + get_response = client.get(f"/location/{location_id}") + assert get_response.status_code == 200 + data = get_response.json() + + # Verify fields exist in response (even if null) + assert "legacy_date_created" in data + assert "legacy_site_date" in data + assert data["legacy_date_created"] is None + assert data["legacy_site_date"] is None + + # cleanup after test + cleanup_post_test(Location, location_id) + + +def test_legacy_fields_independent_of_created_at(): + """Test that created_at (system timestamp) is separate from legacy fields""" + payload = { + "point": "POINT (-106.607784 35.118924)", + "elevation": 1558.8, + "release_status": "draft", + } + response = 
client.post("/location", json=payload) + + assert response.status_code == 201 + data = response.json() + + # created_at is automatically set by AutoBaseMixin + assert "created_at" in data + assert data["created_at"] is not None + + # legacy_date_created is separate and null for new records + assert "legacy_date_created" in data + assert data["legacy_date_created"] is None + + # These are independent fields with different purposes + assert "created_at" != "legacy_date_created" + + # cleanup after test + cleanup_post_test(Location, data["id"]) + + # ============= EOF ============================================= diff --git a/tests/test_thing.py b/tests/test_thing.py index 378f72d02..12aafef1a 100644 --- a/tests/test_thing.py +++ b/tests/test_thing.py @@ -1101,3 +1101,150 @@ def test_delete_thing_id_link_404_not_found(second_thing_id_link): assert response.status_code == 404 data = response.json() assert data["detail"] == f"ThingIdLink with ID {bad_id} not found." + + +# ============= Well completion date tests ==================================== + + +def test_create_well_with_completion_date(location): + """Test creating a well with well_completed_on (active field - users can set this)""" + payload = { + "name": "Test Well", + "location_id": location.id, + "measuring_point_height": 2.5, + "measuring_point_description": "top of casing", + "release_status": "draft", + "well_completed_on": "2004-08-08", + } + response = client.post("/thing/water-well", json=payload) + + assert response.status_code == 201 + data = response.json() + assert "id" in data + assert data["well_completed_on"] == "2004-08-08" + + # cleanup after test + from db import Thing + from tests import cleanup_post_test + + cleanup_post_test(Thing, data["id"]) + + +def test_create_well_with_old_completion_date(location): + """Test creating a well with very old completion date (e.g., for documenting historical wells)""" + payload = { + "name": "Historical Well", + "location_id": location.id, + 
"measuring_point_height": 2.5, + "measuring_point_description": "top of casing", + "release_status": "draft", + "well_completed_on": "1936-01-01", + } + response = client.post("/thing/water-well", json=payload) + + assert response.status_code == 201 + data = response.json() + assert data["well_completed_on"] == "1936-01-01" + + # cleanup after test + from db import Thing + from tests import cleanup_post_test + + cleanup_post_test(Thing, data["id"]) + + +def test_create_well_without_completion_date(location): + """Test that well_completed_on is optional (nullable) when creating a well""" + payload = { + "name": "Test Well Without Date", + "location_id": location.id, + "measuring_point_height": 2.5, + "measuring_point_description": "top of casing", + "release_status": "draft", + } + response = client.post("/thing/water-well", json=payload) + + assert response.status_code == 201 + data = response.json() + # Field should be present but null + assert "well_completed_on" in data + assert data["well_completed_on"] is None + + # cleanup after test + from db import Thing + from tests import cleanup_post_test + + cleanup_post_test(Thing, data["id"]) + + +def test_spring_well_completed_on_is_null(location): + """Test that springs have null well_completed_on field""" + payload = { + "name": "Test Spring", + "location_id": location.id, + "spring_type": "Artesian", + "release_status": "draft", + } + response = client.post("/thing/spring", json=payload) + + assert response.status_code == 201 + data = response.json() + # Springs should have null well_completed_on + assert "well_completed_on" in data + assert data["well_completed_on"] is None + assert data["thing_type"] == "spring" + + # cleanup after test + from db import Thing + from tests import cleanup_post_test + + cleanup_post_test(Thing, data["id"]) + + +def test_well_with_completion_date_and_location_legacy_fields(location): + """Test combined scenario: new well with completion date + location legacy fields (null for new 
locations)""" + # Create a new location (without legacy fields - they're migration-only) + from tests import cleanup_post_test + + location_payload = { + "point": "POINT (-106.607784 35.118924)", + "elevation": 1558.8, + "release_status": "draft", + } + location_response = client.post("/location", json=location_payload) + assert location_response.status_code == 201 + location_id = location_response.json()["id"] + + # Create well with completion date at that location + well_payload = { + "name": "Test Well", + "location_id": location_id, + "measuring_point_height": 2.5, + "measuring_point_description": "top of casing", + "release_status": "draft", + "well_completed_on": "2020-06-15", # User can set this for new wells + } + well_response = client.post("/thing/water-well", json=well_payload) + assert well_response.status_code == 201 + well_id = well_response.json()["id"] + + # Retrieve the well + get_response = client.get(f"/thing/water-well/{well_id}") + assert get_response.status_code == 200 + data = get_response.json() + + # well_completed_on is set (active field) + assert data["well_completed_on"] == "2020-06-15" + + # Location legacy fields are null (migration-only fields) + assert data["current_location"]["legacy_date_created"] is None + assert data["current_location"]["legacy_site_date"] is None + + # cleanup after test + from db import Thing, Location + + cleanup_post_test(Thing, well_id) + cleanup_post_test(Location, location_id) + + +# ============= EOF ============================================= From 5d519545a41fde65f176308ded37cc01b1981452 Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 26 Nov 2025 17:41:41 -0800 Subject: [PATCH 16/66] Create test_transfer_legacy_dates.py --- tests/test_transfer_legacy_dates.py | 410 ++++++++++++++++++++++++++++ 1 file changed, 410 insertions(+) create mode 100644 tests/test_transfer_legacy_dates.py diff --git a/tests/test_transfer_legacy_dates.py b/tests/test_transfer_legacy_dates.py new file mode 100644 
index 000000000..a0cec1014 --- /dev/null +++ b/tests/test_transfer_legacy_dates.py @@ -0,0 +1,410 @@ +# =============================================================================== +# Copyright 2025 ross +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== +""" +Unit tests for legacy date field population during AMPAPI → NMSampleLocations migration. + +These tests verify that: +1. Location.legacy_date_created is populated from CSV DateCreated +2. Location.legacy_site_date is populated from CSV SiteDate (if not null) +3. 
Thing.well_completed_on is populated from CSV CompletionDate (if not null) +""" +import datetime +from unittest.mock import Mock, patch +import pandas as pd +import pytest + +from transfers.util import make_location +from schemas.thing import CreateWell + + +# ============================================================================ +# LOCATION LEGACY DATE TESTS +# ============================================================================ + + +def test_make_location_with_both_legacy_dates(): + """Test that make_location populates both legacy_date_created and legacy_site_date""" + # Create a mock CSV row with both DateCreated and SiteDate + row = pd.Series({ + 'PointID': 'TEST-001', + 'Easting': 350000, + 'Northing': 3880000, + 'DateCreated': '2014-04-03 00:00:00.000', + 'SiteDate': '2002-12-10 00:00:00.000', + 'Altitude': 1558.8, + 'AltDatum': 'NAVD88', + 'AltitudeMethod': 'GPS', + 'LocationId': 1, + 'PublicRelease': True, + 'CoordinateNotes': None, + 'LocationNotes': None, + 'AltitudeAccuracy': None, + }) + + elevations = {} + + # Call make_location + location, elevation_method = make_location(row, elevations) + + # Verify legacy_date_created is set from DateCreated + assert location.legacy_date_created is not None + assert location.legacy_date_created == datetime.date(2014, 4, 3) + + # Verify legacy_site_date is set from SiteDate + assert location.legacy_site_date is not None + assert location.legacy_site_date == datetime.date(2002, 12, 10) + + # Verify created_at is still set (should be the later date) + assert location.created_at is not None + + +def test_make_location_with_only_date_created(): + """Test that make_location handles locations with only DateCreated (no SiteDate)""" + row = pd.Series({ + 'PointID': 'TEST-002', + 'Easting': 350000, + 'Northing': 3880000, + 'DateCreated': '2014-04-03 00:00:00.000', + 'SiteDate': None, # No SiteDate + 'Altitude': 1558.8, + 'AltDatum': 'NAVD88', + 'AltitudeMethod': 'GPS', + 'LocationId': 2, + 'PublicRelease': 
True, + 'CoordinateNotes': None, + 'LocationNotes': None, + 'AltitudeAccuracy': None, + }) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # Verify legacy_date_created is set + assert location.legacy_date_created == datetime.date(2014, 4, 3) + + # Verify legacy_site_date is null (91% of locations don't have SiteDate) + assert location.legacy_site_date is None + + +def test_make_location_with_site_date_later_than_date_created(): + """Test data anomaly: SiteDate is later than DateCreated (should still be accepted)""" + row = pd.Series({ + 'PointID': 'TEST-003', + 'Easting': 350000, + 'Northing': 3880000, + 'DateCreated': '2010-01-15 00:00:00.000', + 'SiteDate': '2015-06-20 00:00:00.000', # Later than DateCreated (anomaly) + 'Altitude': 1558.8, + 'AltDatum': 'NAVD88', + 'AltitudeMethod': 'GPS', + 'LocationId': 3, + 'PublicRelease': True, + 'CoordinateNotes': None, + 'LocationNotes': None, + 'AltitudeAccuracy': None, + }) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # Both dates should be preserved as-is, regardless of order + assert location.legacy_date_created == datetime.date(2010, 1, 15) + assert location.legacy_site_date == datetime.date(2015, 6, 20) + + +def test_make_location_with_very_old_site_date(): + """Test that very old SiteDates (1950s) are preserved correctly""" + row = pd.Series({ + 'PointID': 'SM-0227', # Real example from dataset + 'Easting': 350000, + 'Northing': 3880000, + 'DateCreated': '2008-05-28 00:00:00.000', + 'SiteDate': '1954-05-01 00:00:00.000', # 54 years earlier! 
+ 'Altitude': 1558.8, + 'AltDatum': 'NAVD88', + 'AltitudeMethod': 'GPS', + 'LocationId': 4, + 'PublicRelease': True, + 'CoordinateNotes': None, + 'LocationNotes': None, + 'AltitudeAccuracy': None, + }) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # Verify very old date is preserved + assert location.legacy_site_date == datetime.date(1954, 5, 1) + assert location.legacy_date_created == datetime.date(2008, 5, 28) + + # Verify 54-year time gap + time_gap = (location.legacy_date_created - location.legacy_site_date).days + assert time_gap == 19751 # Approximately 54 years + + +def test_make_location_legacy_dates_are_date_not_datetime(): + """Test that legacy date fields are Date type (not DateTime)""" + row = pd.Series({ + 'PointID': 'TEST-004', + 'Easting': 350000, + 'Northing': 3880000, + 'DateCreated': '2014-04-03 10:30:45.123', # Has time component + 'SiteDate': '2002-12-10 14:22:33.456', # Has time component + 'Altitude': 1558.8, + 'AltDatum': 'NAVD88', + 'AltitudeMethod': 'GPS', + 'LocationId': 5, + 'PublicRelease': True, + 'CoordinateNotes': None, + 'LocationNotes': None, + 'AltitudeAccuracy': None, + }) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # Verify they are date objects (not datetime) + assert isinstance(location.legacy_date_created, datetime.date) + assert not isinstance(location.legacy_date_created, datetime.datetime) + + assert isinstance(location.legacy_site_date, datetime.date) + assert not isinstance(location.legacy_site_date, datetime.datetime) + + # Verify time component is stripped + assert location.legacy_date_created == datetime.date(2014, 4, 3) + assert location.legacy_site_date == datetime.date(2002, 12, 10) + + +def test_make_location_legacy_dates_independent_of_created_at(): + """Test that legacy dates don't affect created_at timestamp""" + row = pd.Series({ + 'PointID': 'TEST-005', + 'Easting': 350000, + 'Northing': 3880000, + 'DateCreated': '2014-04-03 
00:00:00.000', + 'SiteDate': '2002-12-10 00:00:00.000', + 'Altitude': 1558.8, + 'AltDatum': 'NAVD88', + 'AltitudeMethod': 'GPS', + 'LocationId': 6, + 'PublicRelease': True, + 'CoordinateNotes': None, + 'LocationNotes': None, + 'AltitudeAccuracy': None, + }) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # created_at should be a DateTime (with timezone) + assert isinstance(location.created_at, datetime.datetime) + + # legacy fields should be Date (no timezone) + assert isinstance(location.legacy_date_created, datetime.date) + assert isinstance(location.legacy_site_date, datetime.date) + + # They should be independent + assert location.created_at is not None + assert location.legacy_date_created is not None + assert location.legacy_site_date is not None + + +# ============================================================================ +# WELL COMPLETION DATE TESTS +# ============================================================================ + + +def test_create_well_schema_accepts_well_completed_on(): + """Test that CreateWell schema accepts well_completed_on from CSV CompletionDate""" + # Simulate data from CSV transfer + well_data = { + 'location_id': 1, + 'name': 'TEST-WELL-001', + 'well_completed_on': datetime.date(2004, 8, 8), # From CSV CompletionDate + 'hole_depth': 100.0, + 'well_depth': 95.0, + 'measuring_point_height': 2.5, + 'measuring_point_description': 'top of casing', + 'release_status': 'public', + } + + # Validate using CreateWell schema + schema = CreateWell(**well_data) + + assert schema.well_completed_on == datetime.date(2004, 8, 8) + + +def test_create_well_schema_well_completed_on_optional(): + """Test that well_completed_on is optional (70% of wells don't have CompletionDate)""" + well_data = { + 'location_id': 1, + 'name': 'TEST-WELL-002', + 'hole_depth': 100.0, + 'well_depth': 95.0, + 'measuring_point_height': 2.5, + 'measuring_point_description': 'top of casing', + 'release_status': 'public', + # No 
well_completed_on provided + } + + # Should not raise validation error + schema = CreateWell(**well_data) + + # Field should be optional + assert hasattr(schema, 'well_completed_on') + # Value should be None when not provided + assert schema.well_completed_on is None + + +def test_create_well_with_very_old_completion_date(): + """Test that very old completion dates (1936) are accepted""" + well_data = { + 'location_id': 1, + 'name': 'HISTORICAL-WELL', + 'well_completed_on': datetime.date(1936, 1, 1), # Oldest well in dataset + 'hole_depth': 100.0, + 'well_depth': 95.0, + 'measuring_point_height': 2.5, + 'measuring_point_description': 'top of casing', + 'release_status': 'public', + } + + schema = CreateWell(**well_data) + + assert schema.well_completed_on == datetime.date(1936, 1, 1) + + +def test_create_well_completed_on_is_date_not_datetime(): + """Test that well_completed_on is Date type (not DateTime)""" + well_data = { + 'location_id': 1, + 'name': 'TEST-WELL-003', + 'well_completed_on': datetime.date(2004, 8, 8), # Date, not DateTime + 'hole_depth': 100.0, + 'well_depth': 95.0, + 'measuring_point_height': 2.5, + 'measuring_point_description': 'top of casing', + 'release_status': 'public', + } + + schema = CreateWell(**well_data) + + # Should accept date type + assert isinstance(schema.well_completed_on, datetime.date) + assert not isinstance(schema.well_completed_on, datetime.datetime) + + +# ============================================================================ +# DATA COVERAGE TESTS (Simulating Migration Statistics) +# ============================================================================ + + +def test_location_legacy_date_coverage_statistics(): + """Test that migration preserves expected percentages of legacy dates""" + # Simulate 100 location records from CSV + locations_created = 0 + locations_with_site_date = 0 + + for i in range(100): + if i < 9: # 9% have SiteDate + row = pd.Series({ + 'PointID': f'TEST-{i:03d}', + 'Easting': 350000 + i, + 
'Northing': 3880000 + i, + 'DateCreated': '2014-04-03 00:00:00.000', + 'SiteDate': '2002-12-10 00:00:00.000', + 'Altitude': 1558.8, + 'AltDatum': 'NAVD88', + 'AltitudeMethod': 'GPS', + 'LocationId': i, + 'PublicRelease': True, + 'CoordinateNotes': None, + 'LocationNotes': None, + 'AltitudeAccuracy': None, + }) + else: # 91% don't have SiteDate + row = pd.Series({ + 'PointID': f'TEST-{i:03d}', + 'Easting': 350000 + i, + 'Northing': 3880000 + i, + 'DateCreated': '2014-04-03 00:00:00.000', + 'SiteDate': None, + 'Altitude': 1558.8, + 'AltDatum': 'NAVD88', + 'AltitudeMethod': 'GPS', + 'LocationId': i, + 'PublicRelease': True, + 'CoordinateNotes': None, + 'LocationNotes': None, + 'AltitudeAccuracy': None, + }) + + elevations = {} + location, _ = make_location(row, elevations) + + # Count coverage + if location.legacy_date_created is not None: + locations_created += 1 + + if location.legacy_site_date is not None: + locations_with_site_date += 1 + + # Verify expected coverage + assert locations_created == 100 # 100% should have legacy_date_created + assert locations_with_site_date == 9 # 9% should have legacy_site_date + + +def test_well_completion_date_coverage_statistics(): + """Test that expected percentage of wells have completion dates""" + # Simulate 100 wells from CSV + wells_with_completion_date = 0 + + for i in range(100): + if i < 30: # 30% have CompletionDate + well_data = { + 'location_id': 1, + 'name': f'WELL-{i:03d}', + 'well_completed_on': datetime.date(2004, 8, 8), + 'hole_depth': 100.0, + 'well_depth': 95.0, + 'measuring_point_height': 2.5, + 'measuring_point_description': 'top of casing', + 'release_status': 'public', + } + else: # 70% don't have CompletionDate + well_data = { + 'location_id': 1, + 'name': f'WELL-{i:03d}', + 'hole_depth': 100.0, + 'well_depth': 95.0, + 'measuring_point_height': 2.5, + 'measuring_point_description': 'top of casing', + 'release_status': 'public', + # No well_completed_on + } + + schema = CreateWell(**well_data) + + if 
schema.well_completed_on is not None: + wells_with_completion_date += 1 + + # Verify expected coverage + assert wells_with_completion_date == 30 # 30% should have completion dates + + +# ============================================================================ +# EOF +# ============================================================================ From 687fb4aa1b5c4060f14d0fe140b78572f5909c9f Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 26 Nov 2025 18:05:52 -0800 Subject: [PATCH 17/66] Support changes in unit tests for thing and transfer script --- tests/test_thing.py | 5 +++-- tests/test_transfer_legacy_dates.py | 11 +++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_thing.py b/tests/test_thing.py index 12aafef1a..3d76e3b99 100644 --- a/tests/test_thing.py +++ b/tests/test_thing.py @@ -1237,8 +1237,9 @@ def test_well_with_completion_date_and_location_legacy_fields(location): assert data["well_completed_on"] == "2020-06-15" # Location legacy fields are null (migration-only fields) - assert data["current_location"]["legacy_date_created"] is None - assert data["current_location"]["legacy_site_date"] is None + # current_location is a GeoJSON Feature, so fields are under properties + assert data["current_location"]["properties"]["legacy_date_created"] is None + assert data["current_location"]["properties"]["legacy_site_date"] is None # cleanup after test from db import Thing, Location diff --git a/tests/test_transfer_legacy_dates.py b/tests/test_transfer_legacy_dates.py index a0cec1014..53f304c4a 100644 --- a/tests/test_transfer_legacy_dates.py +++ b/tests/test_transfer_legacy_dates.py @@ -67,8 +67,8 @@ def test_make_location_with_both_legacy_dates(): assert location.legacy_site_date is not None assert location.legacy_site_date == datetime.date(2002, 12, 10) - # Verify created_at is still set (should be the later date) - assert location.created_at is not None + # Verify created_at is NOT set during migration (it's 
auto-set by AutoBaseMixin on save) + assert location.created_at is None def test_make_location_with_only_date_created(): @@ -209,15 +209,14 @@ def test_make_location_legacy_dates_independent_of_created_at(): elevations = {} location, elevation_method = make_location(row, elevations) - # created_at should be a DateTime (with timezone) - assert isinstance(location.created_at, datetime.datetime) + # created_at should be None during transfer (auto-set by AutoBaseMixin on save) + assert location.created_at is None # legacy fields should be Date (no timezone) assert isinstance(location.legacy_date_created, datetime.date) assert isinstance(location.legacy_site_date, datetime.date) - # They should be independent - assert location.created_at is not None + # Legacy fields should be populated assert location.legacy_date_created is not None assert location.legacy_site_date is not None From 6552bc00fc3560fcd8abfae02486d6a1363d61e5 Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 26 Nov 2025 18:06:34 -0800 Subject: [PATCH 18/66] Implement changes in db and schemas --- db/location.py | 9 +++++++++ db/thing.py | 5 +++++ schemas/location.py | 15 +++++++++++++++ schemas/thing.py | 7 +++++++ 4 files changed, 36 insertions(+) diff --git a/db/location.py b/db/location.py index 50b1aa0db..3b4271592 100644 --- a/db/location.py +++ b/db/location.py @@ -23,6 +23,7 @@ String, ForeignKey, DateTime, + Date, func, Text, ) @@ -61,6 +62,14 @@ class Location(Base, AutoBaseMixin, ReleaseMixin, NotesMixin, DataProvenanceMixi nma_notes_location: Mapped[str] = mapped_column(Text, nullable=True) nma_coordinate_notes: Mapped[str] = mapped_column(Text, nullable=True) + # --- Legacy AMPAPI Date Fields (Migration-Only, Read-Only Post-Migration) --- + legacy_date_created: Mapped[datetime.date] = mapped_column( + Date, nullable=True, comment="Original AMPAPI DateCreated (migration-only field)" + ) + legacy_site_date: Mapped[datetime.date] = mapped_column( + Date, nullable=True, comment="Original 
AMPAPI SiteDate (migration-only field)" + ) + # --- Relationship Definitions --- thing_associations: Mapped[list["LocationThingAssociation"]] = relationship( back_populates="location", cascade="all, delete-orphan" diff --git a/db/thing.py b/db/thing.py index 9f30d08e2..b42b70d56 100644 --- a/db/thing.py +++ b/db/thing.py @@ -115,6 +115,11 @@ class Thing( ) well_construction_notes: Mapped[str] = mapped_column(Text, nullable=True) + well_completed_on: Mapped[date] = mapped_column( + Date, + nullable=True, + comment="Date when well construction/drilling was completed (from AMPAPI CompletionDate, active field for new wells)", + ) # Spring-related columns spring_type: Mapped[str] = lexicon_term( diff --git a/schemas/location.py b/schemas/location.py index e911e3359..e18b76996 100644 --- a/schemas/location.py +++ b/schemas/location.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # =============================================================================== +from datetime import date from typing import List from geoalchemy2 import WKBElement @@ -106,6 +107,9 @@ class GeoJSONProperties(BaseModel): default_factory=GeoJSONUTMCoordinates ) notes: list[NoteResponse] = [] + # Legacy AMPAPI date fields (migration-only, read-only) + legacy_date_created: date | None = None + legacy_site_date: date | None = None model_config = ConfigDict( from_attributes=True, @@ -150,6 +154,9 @@ def populate_fields(cls, data: Any) -> Any: data_dict["properties"]["notes"] = data_dict.get("notes") data_dict["properties"]["elevation"] = convert_m_to_ft(elevation_m) data_dict["properties"]["elevation_method"] = data_dict.get("elevation_method") + # populate legacy date fields + data_dict["properties"]["legacy_date_created"] = data_dict.get("legacy_date_created") + data_dict["properties"]["legacy_site_date"] = data_dict.get("legacy_site_date") # populate UTM coordinates point_utm_zone_13n_wkt = transform_srid( @@ -181,6 
+188,10 @@ class LocationResponse(BaseResponseModel): county: str | None quad_name: str | None + # Legacy AMPAPI date fields (migration-only, read-only post-migration) + legacy_date_created: date | None = None + legacy_site_date: date | None = None + @field_validator("point", mode="before") def point_to_wkt(cls, value): if isinstance(value, WKBElement): @@ -219,5 +230,9 @@ class UpdateLocation(BaseUpdateModel, ValidateLocation): coordinate_accuracy: float | None = None coordinate_method: CoordinateMethod | None = None + # Legacy AMPAPI date fields (migration-only, can be updated but not created) + legacy_date_created: date | None = None + legacy_site_date: date | None = None + # ============= EOF ============================================= diff --git a/schemas/thing.py b/schemas/thing.py index cf8c3ef2b..6de5908cc 100644 --- a/schemas/thing.py +++ b/schemas/thing.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# =============================================================================== +from datetime import date from typing import List from pydantic import BaseModel, model_validator, Field, field_validator @@ -130,6 +131,8 @@ class CreateWell(CreateBaseThing, ValidateWell): ) measuring_point_description: str | None notes: list[CreateNote] | None = None + # Active field: users can set this for new wells + well_completed_on: date | None = None class CreateSpring(CreateBaseThing): @@ -224,6 +227,8 @@ class WellResponse(BaseThingResponse): measuring_point_height: float measuring_point_height_unit: str = "ft" measuring_point_description: str | None + # Active field: completion date for wells + well_completed_on: date | None = None water_notes: list[NoteResponse] | None = None measuring_notes: list[NoteResponse] | None = None @@ -329,6 +334,8 @@ class UpdateWell(UpdateThing, ValidateWell): well_casing_diameter: float | None = None # in inches well_casing_depth: float | None = None # in feet well_casing_materials: list[str] | None = None + # Active field: users can update completion date + well_completed_on: date | None = None class UpdateSpring(UpdateThing): From 08fb22105834b3fa70dc030cffb0af246bf3471b Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 26 Nov 2025 18:06:49 -0800 Subject: [PATCH 19/66] Implement changes in transfer scripts --- transfers/util.py | 43 +++++++++++++------------------------- transfers/well_transfer.py | 15 +++++++++++++ 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/transfers/util.py b/transfers/util.py index d08798425..d39845f44 100644 --- a/transfers/util.py +++ b/transfers/util.py @@ -214,33 +214,6 @@ def make_location(row: pd.Series, elevations: dict) -> tuple: point, source_srid=SRID_UTM_ZONE_13N, target_srid=SRID_WGS84 ) - """ - Developer's notes - - AMP folks said that the earlier date between DateCreated and SiteDate is when - the site was inventoried, whereas the later is when the record was made in - the 
database. This was because they were used interchangeably. - """ - if row.DateCreated and row.SiteDate: - - date_created = datetime.strptime(row.DateCreated, "%Y-%m-%d %H:%M:%S.%f") - site_date = datetime.strptime(row.SiteDate, "%Y-%m-%d %H:%M:%S.%f") - - if date_created > site_date: - created_at = date_created - else: - created_at = site_date - elif row.DateCreated and not row.SiteDate: - created_at = datetime.strptime(row.DateCreated, "%Y-%m-%d %H:%M:%S.%f") - elif not row.DateCreated and row.SiteDate: - created_at = datetime.strptime(row.SiteDate, "%Y-%m-%d %H:%M:%S.%f") - else: - created_at = None - - # convert created_at from MST/MDT to UTC - if created_at is not None: - created_at = convert_mt_to_utc(created_at) - z = row.Altitude if z: elevation_from_epqs = False @@ -271,14 +244,28 @@ def make_location(row: pd.Series, elevations: dict) -> tuple: f"LU_AltitudeMethod:{row.AltitudeMethod.strip()}" ) + # Extract legacy date fields (Date type, not DateTime) + legacy_date_created = None + if row.DateCreated: + legacy_date_created = datetime.strptime( + row.DateCreated, "%Y-%m-%d %H:%M:%S.%f" + ).date() + + legacy_site_date = None + if row.SiteDate: + legacy_site_date = datetime.strptime( + row.SiteDate, "%Y-%m-%d %H:%M:%S.%f" + ).date() + location = Location( nma_pk_location=row.LocationId, point=transformed_point.wkt, elevation=z, release_status="public" if row.PublicRelease else "private", - created_at=created_at, nma_coordinate_notes=row.CoordinateNotes, nma_notes_location=row.LocationNotes, + legacy_date_created=legacy_date_created, + legacy_site_date=legacy_site_date, ) return location, elevation_method diff --git a/transfers/well_transfer.py b/transfers/well_transfer.py index ee54d0216..5daa1d8ee 100644 --- a/transfers/well_transfer.py +++ b/transfers/well_transfer.py @@ -237,6 +237,19 @@ def transfer_wells(session: Session, flags: dict = None, limit: int = 0) -> None [] if isna(row.CasingDescription) else _extract_casing_materials(row) ) + # Extract 
well_completed_on from CompletionDate (Date type, not DateTime) + well_completed_on = None + if not isna(row.CompletionDate): + try: + well_completed_on = datetime.strptime( + row.CompletionDate, "%Y-%m-%d %H:%M:%S.%f" + ).date() + except (ValueError, AttributeError): + # If parsing fails, leave as None + logger.warning( + f"Could not parse CompletionDate for {row.PointID}: {row.CompletionDate}" + ) + # manually add the well rather than add_well from services/thing_helper.py # so that effective_start can be set on the location assocation @@ -254,6 +267,7 @@ def transfer_wells(session: Session, flags: dict = None, limit: int = 0) -> None release_status="public" if row.PublicRelease else "private", measuring_point_height=row.MPHeight, measuring_point_description=row.MeasuringPoint, + well_completed_on=well_completed_on, notes=( [{"content": row.Notes, "note_type": "Other"}] if row.Notes else [] ), @@ -283,6 +297,7 @@ def transfer_wells(session: Session, flags: dict = None, limit: int = 0) -> None well_data["nma_pk_welldata"] = row.WellID well_data.pop("notes") + # well_completed_on is kept in well_data (not excluded above) well = Thing(**well_data) session.add(well) # logger.info(f"Created well for {row.PointID}") From 47aad3f14d0bbe059299cc919f332c8d1d7febcf Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 26 Nov 2025 18:07:08 -0800 Subject: [PATCH 20/66] Address measuring point bug --- services/thing_helper.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/services/thing_helper.py b/services/thing_helper.py index 53ce54577..084a8b02b 100644 --- a/services/thing_helper.py +++ b/services/thing_helper.py @@ -13,6 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# =============================================================================== +from datetime import datetime +from zoneinfo import ZoneInfo + from fastapi import Request from fastapi_pagination.ext.sqlalchemy import paginate from pydantic import BaseModel @@ -32,6 +35,7 @@ WellCasingMaterial, ) from db.group import GroupThingAssociation +from db.measuring_point_history import MeasuringPointHistory from services.audit_helper import audit_add from services.crud_helper import model_patcher from services.exceptions_helper import PydanticStyleException @@ -159,6 +163,10 @@ def add_thing( location_id = data.pop("location_id", None) group_id = data.pop("group_id", None) + # Extract measuring point data (stored in separate history table) + measuring_point_height = data.pop("measuring_point_height", None) + measuring_point_description = data.pop("measuring_point_description", None) + try: thing = Thing(**data) thing.thing_type = thing_type @@ -169,6 +177,18 @@ def add_thing( session.flush() session.refresh(thing) + # Create MeasuringPointHistory record if measuring_point_height provided + if measuring_point_height is not None: + measuring_point_history = MeasuringPointHistory( + thing_id=thing.id, + measuring_point_height=measuring_point_height, + measuring_point_description=measuring_point_description, + start_date=datetime.now(tz=ZoneInfo("UTC")), + end_date=None, + ) + audit_add(user, measuring_point_history) + session.add(measuring_point_history) + # endpoint catches ProgrammingError if location_id or group_id do not exist if group_id: assoc = GroupThingAssociation() From 546b7013286c37529b5e2a8e0524ae09daac1f5f Mon Sep 17 00:00:00 2001 From: kbighorse Date: Thu, 27 Nov 2025 02:07:05 +0000 Subject: [PATCH 21/66] Formatting changes --- db/location.py | 4 +- schemas/location.py | 4 +- tests/test_transfer_legacy_dates.py | 350 +++++++++++++++------------- 3 files changed, 189 insertions(+), 169 deletions(-) diff --git a/db/location.py b/db/location.py index 
3b4271592..a07958346 100644 --- a/db/location.py +++ b/db/location.py @@ -64,7 +64,9 @@ class Location(Base, AutoBaseMixin, ReleaseMixin, NotesMixin, DataProvenanceMixi # --- Legacy AMPAPI Date Fields (Migration-Only, Read-Only Post-Migration) --- legacy_date_created: Mapped[datetime.date] = mapped_column( - Date, nullable=True, comment="Original AMPAPI DateCreated (migration-only field)" + Date, + nullable=True, + comment="Original AMPAPI DateCreated (migration-only field)", ) legacy_site_date: Mapped[datetime.date] = mapped_column( Date, nullable=True, comment="Original AMPAPI SiteDate (migration-only field)" diff --git a/schemas/location.py b/schemas/location.py index e18b76996..1f4bad472 100644 --- a/schemas/location.py +++ b/schemas/location.py @@ -155,7 +155,9 @@ def populate_fields(cls, data: Any) -> Any: data_dict["properties"]["elevation"] = convert_m_to_ft(elevation_m) data_dict["properties"]["elevation_method"] = data_dict.get("elevation_method") # populate legacy date fields - data_dict["properties"]["legacy_date_created"] = data_dict.get("legacy_date_created") + data_dict["properties"]["legacy_date_created"] = data_dict.get( + "legacy_date_created" + ) data_dict["properties"]["legacy_site_date"] = data_dict.get("legacy_site_date") # populate UTM coordinates diff --git a/tests/test_transfer_legacy_dates.py b/tests/test_transfer_legacy_dates.py index 53f304c4a..30fbcd5ae 100644 --- a/tests/test_transfer_legacy_dates.py +++ b/tests/test_transfer_legacy_dates.py @@ -38,21 +38,23 @@ def test_make_location_with_both_legacy_dates(): """Test that make_location populates both legacy_date_created and legacy_site_date""" # Create a mock CSV row with both DateCreated and SiteDate - row = pd.Series({ - 'PointID': 'TEST-001', - 'Easting': 350000, - 'Northing': 3880000, - 'DateCreated': '2014-04-03 00:00:00.000', - 'SiteDate': '2002-12-10 00:00:00.000', - 'Altitude': 1558.8, - 'AltDatum': 'NAVD88', - 'AltitudeMethod': 'GPS', - 'LocationId': 1, - 'PublicRelease': 
True, - 'CoordinateNotes': None, - 'LocationNotes': None, - 'AltitudeAccuracy': None, - }) + row = pd.Series( + { + "PointID": "TEST-001", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": "2014-04-03 00:00:00.000", + "SiteDate": "2002-12-10 00:00:00.000", + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 1, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) elevations = {} @@ -73,21 +75,23 @@ def test_make_location_with_both_legacy_dates(): def test_make_location_with_only_date_created(): """Test that make_location handles locations with only DateCreated (no SiteDate)""" - row = pd.Series({ - 'PointID': 'TEST-002', - 'Easting': 350000, - 'Northing': 3880000, - 'DateCreated': '2014-04-03 00:00:00.000', - 'SiteDate': None, # No SiteDate - 'Altitude': 1558.8, - 'AltDatum': 'NAVD88', - 'AltitudeMethod': 'GPS', - 'LocationId': 2, - 'PublicRelease': True, - 'CoordinateNotes': None, - 'LocationNotes': None, - 'AltitudeAccuracy': None, - }) + row = pd.Series( + { + "PointID": "TEST-002", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": "2014-04-03 00:00:00.000", + "SiteDate": None, # No SiteDate + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 2, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) elevations = {} location, elevation_method = make_location(row, elevations) @@ -101,21 +105,23 @@ def test_make_location_with_only_date_created(): def test_make_location_with_site_date_later_than_date_created(): """Test data anomaly: SiteDate is later than DateCreated (should still be accepted)""" - row = pd.Series({ - 'PointID': 'TEST-003', - 'Easting': 350000, - 'Northing': 3880000, - 'DateCreated': '2010-01-15 00:00:00.000', - 'SiteDate': '2015-06-20 00:00:00.000', # Later than DateCreated (anomaly) - 'Altitude': 1558.8, - 'AltDatum': 'NAVD88', - 
'AltitudeMethod': 'GPS', - 'LocationId': 3, - 'PublicRelease': True, - 'CoordinateNotes': None, - 'LocationNotes': None, - 'AltitudeAccuracy': None, - }) + row = pd.Series( + { + "PointID": "TEST-003", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": "2010-01-15 00:00:00.000", + "SiteDate": "2015-06-20 00:00:00.000", # Later than DateCreated (anomaly) + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 3, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) elevations = {} location, elevation_method = make_location(row, elevations) @@ -127,21 +133,23 @@ def test_make_location_with_site_date_later_than_date_created(): def test_make_location_with_very_old_site_date(): """Test that very old SiteDates (1950s) are preserved correctly""" - row = pd.Series({ - 'PointID': 'SM-0227', # Real example from dataset - 'Easting': 350000, - 'Northing': 3880000, - 'DateCreated': '2008-05-28 00:00:00.000', - 'SiteDate': '1954-05-01 00:00:00.000', # 54 years earlier! - 'Altitude': 1558.8, - 'AltDatum': 'NAVD88', - 'AltitudeMethod': 'GPS', - 'LocationId': 4, - 'PublicRelease': True, - 'CoordinateNotes': None, - 'LocationNotes': None, - 'AltitudeAccuracy': None, - }) + row = pd.Series( + { + "PointID": "SM-0227", # Real example from dataset + "Easting": 350000, + "Northing": 3880000, + "DateCreated": "2008-05-28 00:00:00.000", + "SiteDate": "1954-05-01 00:00:00.000", # 54 years earlier! 
+ "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 4, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) elevations = {} location, elevation_method = make_location(row, elevations) @@ -157,21 +165,23 @@ def test_make_location_with_very_old_site_date(): def test_make_location_legacy_dates_are_date_not_datetime(): """Test that legacy date fields are Date type (not DateTime)""" - row = pd.Series({ - 'PointID': 'TEST-004', - 'Easting': 350000, - 'Northing': 3880000, - 'DateCreated': '2014-04-03 10:30:45.123', # Has time component - 'SiteDate': '2002-12-10 14:22:33.456', # Has time component - 'Altitude': 1558.8, - 'AltDatum': 'NAVD88', - 'AltitudeMethod': 'GPS', - 'LocationId': 5, - 'PublicRelease': True, - 'CoordinateNotes': None, - 'LocationNotes': None, - 'AltitudeAccuracy': None, - }) + row = pd.Series( + { + "PointID": "TEST-004", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": "2014-04-03 10:30:45.123", # Has time component + "SiteDate": "2002-12-10 14:22:33.456", # Has time component + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 5, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) elevations = {} location, elevation_method = make_location(row, elevations) @@ -190,21 +200,23 @@ def test_make_location_legacy_dates_are_date_not_datetime(): def test_make_location_legacy_dates_independent_of_created_at(): """Test that legacy dates don't affect created_at timestamp""" - row = pd.Series({ - 'PointID': 'TEST-005', - 'Easting': 350000, - 'Northing': 3880000, - 'DateCreated': '2014-04-03 00:00:00.000', - 'SiteDate': '2002-12-10 00:00:00.000', - 'Altitude': 1558.8, - 'AltDatum': 'NAVD88', - 'AltitudeMethod': 'GPS', - 'LocationId': 6, - 'PublicRelease': True, - 'CoordinateNotes': None, - 'LocationNotes': None, - 'AltitudeAccuracy': None, - }) + row = 
pd.Series( + { + "PointID": "TEST-005", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": "2014-04-03 00:00:00.000", + "SiteDate": "2002-12-10 00:00:00.000", + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 6, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) elevations = {} location, elevation_method = make_location(row, elevations) @@ -230,14 +242,14 @@ def test_create_well_schema_accepts_well_completed_on(): """Test that CreateWell schema accepts well_completed_on from CSV CompletionDate""" # Simulate data from CSV transfer well_data = { - 'location_id': 1, - 'name': 'TEST-WELL-001', - 'well_completed_on': datetime.date(2004, 8, 8), # From CSV CompletionDate - 'hole_depth': 100.0, - 'well_depth': 95.0, - 'measuring_point_height': 2.5, - 'measuring_point_description': 'top of casing', - 'release_status': 'public', + "location_id": 1, + "name": "TEST-WELL-001", + "well_completed_on": datetime.date(2004, 8, 8), # From CSV CompletionDate + "hole_depth": 100.0, + "well_depth": 95.0, + "measuring_point_height": 2.5, + "measuring_point_description": "top of casing", + "release_status": "public", } # Validate using CreateWell schema @@ -249,13 +261,13 @@ def test_create_well_schema_accepts_well_completed_on(): def test_create_well_schema_well_completed_on_optional(): """Test that well_completed_on is optional (70% of wells don't have CompletionDate)""" well_data = { - 'location_id': 1, - 'name': 'TEST-WELL-002', - 'hole_depth': 100.0, - 'well_depth': 95.0, - 'measuring_point_height': 2.5, - 'measuring_point_description': 'top of casing', - 'release_status': 'public', + "location_id": 1, + "name": "TEST-WELL-002", + "hole_depth": 100.0, + "well_depth": 95.0, + "measuring_point_height": 2.5, + "measuring_point_description": "top of casing", + "release_status": "public", # No well_completed_on provided } @@ -263,7 +275,7 @@ def 
test_create_well_schema_well_completed_on_optional(): schema = CreateWell(**well_data) # Field should be optional - assert hasattr(schema, 'well_completed_on') + assert hasattr(schema, "well_completed_on") # Value should be None when not provided assert schema.well_completed_on is None @@ -271,14 +283,14 @@ def test_create_well_schema_well_completed_on_optional(): def test_create_well_with_very_old_completion_date(): """Test that very old completion dates (1936) are accepted""" well_data = { - 'location_id': 1, - 'name': 'HISTORICAL-WELL', - 'well_completed_on': datetime.date(1936, 1, 1), # Oldest well in dataset - 'hole_depth': 100.0, - 'well_depth': 95.0, - 'measuring_point_height': 2.5, - 'measuring_point_description': 'top of casing', - 'release_status': 'public', + "location_id": 1, + "name": "HISTORICAL-WELL", + "well_completed_on": datetime.date(1936, 1, 1), # Oldest well in dataset + "hole_depth": 100.0, + "well_depth": 95.0, + "measuring_point_height": 2.5, + "measuring_point_description": "top of casing", + "release_status": "public", } schema = CreateWell(**well_data) @@ -289,14 +301,14 @@ def test_create_well_with_very_old_completion_date(): def test_create_well_completed_on_is_date_not_datetime(): """Test that well_completed_on is Date type (not DateTime)""" well_data = { - 'location_id': 1, - 'name': 'TEST-WELL-003', - 'well_completed_on': datetime.date(2004, 8, 8), # Date, not DateTime - 'hole_depth': 100.0, - 'well_depth': 95.0, - 'measuring_point_height': 2.5, - 'measuring_point_description': 'top of casing', - 'release_status': 'public', + "location_id": 1, + "name": "TEST-WELL-003", + "well_completed_on": datetime.date(2004, 8, 8), # Date, not DateTime + "hole_depth": 100.0, + "well_depth": 95.0, + "measuring_point_height": 2.5, + "measuring_point_description": "top of casing", + "release_status": "public", } schema = CreateWell(**well_data) @@ -319,37 +331,41 @@ def test_location_legacy_date_coverage_statistics(): for i in range(100): if i < 9: 
# 9% have SiteDate - row = pd.Series({ - 'PointID': f'TEST-{i:03d}', - 'Easting': 350000 + i, - 'Northing': 3880000 + i, - 'DateCreated': '2014-04-03 00:00:00.000', - 'SiteDate': '2002-12-10 00:00:00.000', - 'Altitude': 1558.8, - 'AltDatum': 'NAVD88', - 'AltitudeMethod': 'GPS', - 'LocationId': i, - 'PublicRelease': True, - 'CoordinateNotes': None, - 'LocationNotes': None, - 'AltitudeAccuracy': None, - }) + row = pd.Series( + { + "PointID": f"TEST-{i:03d}", + "Easting": 350000 + i, + "Northing": 3880000 + i, + "DateCreated": "2014-04-03 00:00:00.000", + "SiteDate": "2002-12-10 00:00:00.000", + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": i, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) else: # 91% don't have SiteDate - row = pd.Series({ - 'PointID': f'TEST-{i:03d}', - 'Easting': 350000 + i, - 'Northing': 3880000 + i, - 'DateCreated': '2014-04-03 00:00:00.000', - 'SiteDate': None, - 'Altitude': 1558.8, - 'AltDatum': 'NAVD88', - 'AltitudeMethod': 'GPS', - 'LocationId': i, - 'PublicRelease': True, - 'CoordinateNotes': None, - 'LocationNotes': None, - 'AltitudeAccuracy': None, - }) + row = pd.Series( + { + "PointID": f"TEST-{i:03d}", + "Easting": 350000 + i, + "Northing": 3880000 + i, + "DateCreated": "2014-04-03 00:00:00.000", + "SiteDate": None, + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": i, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) elevations = {} location, _ = make_location(row, elevations) @@ -374,24 +390,24 @@ def test_well_completion_date_coverage_statistics(): for i in range(100): if i < 30: # 30% have CompletionDate well_data = { - 'location_id': 1, - 'name': f'WELL-{i:03d}', - 'well_completed_on': datetime.date(2004, 8, 8), - 'hole_depth': 100.0, - 'well_depth': 95.0, - 'measuring_point_height': 2.5, - 'measuring_point_description': 'top 
of casing', - 'release_status': 'public', + "location_id": 1, + "name": f"WELL-{i:03d}", + "well_completed_on": datetime.date(2004, 8, 8), + "hole_depth": 100.0, + "well_depth": 95.0, + "measuring_point_height": 2.5, + "measuring_point_description": "top of casing", + "release_status": "public", } else: # 70% don't have CompletionDate well_data = { - 'location_id': 1, - 'name': f'WELL-{i:03d}', - 'hole_depth': 100.0, - 'well_depth': 95.0, - 'measuring_point_height': 2.5, - 'measuring_point_description': 'top of casing', - 'release_status': 'public', + "location_id": 1, + "name": f"WELL-{i:03d}", + "hole_depth": 100.0, + "well_depth": 95.0, + "measuring_point_height": 2.5, + "measuring_point_description": "top of casing", + "release_status": "public", # No well_completed_on } From 0ceee93f69f5c38087558b6e71ab1b0f997a2173 Mon Sep 17 00:00:00 2001 From: jakeross Date: Fri, 28 Nov 2025 13:46:24 -0700 Subject: [PATCH 22/66] refactor: enhance asset transfer process by implementing AssetTransferer class and improving data handling --- transfers/asset_transfer.py | 125 +++++-------- transfers/group_transfer.py | 85 +++++++-- transfers/link_ids_transfer.py | 313 +++++++++++++++------------------ transfers/metrics.py | 80 +++++---- transfers/sensor_transfer.py | 284 ++++++------------------------ transfers/transfer.py | 92 +++++----- transfers/transferer.py | 43 +++-- transfers/util.py | 75 +++++--- transfers/well_transfer.py | 71 ++++---- 9 files changed, 524 insertions(+), 644 deletions(-) diff --git a/transfers/asset_transfer.py b/transfers/asset_transfer.py index 71d3ad23b..b7938f15d 100644 --- a/transfers/asset_transfer.py +++ b/transfers/asset_transfer.py @@ -13,53 +13,49 @@ # See the License for the specific language governing permissions and # limitations under the License. # =============================================================================== -# for testing only. 
remove later -from dotenv import load_dotenv -from db.engine import session_ctx - -load_dotenv() -# ----------------------------------------------- - import io from starlette.datastructures import UploadFile -from sqlalchemy.orm import Session -from db import Asset, AssetThingAssociation, Thing -from services.audit_helper import audit_add + +from db import Asset, AssetThingAssociation from services.gcs_helper import ( gcs_upload, - check_asset_exists, get_storage_bucket, get_storage_client, ) -from transfers.util import get_valid_things, read_csv from transfers.logger import logger - - -def transfer_assets(session: Session) -> None: - client = get_storage_client() - - bucket = get_storage_bucket(client) - logger.info(f"Using bucket {bucket.name}") - - well_photos = read_csv("WellPhotos") - # for name in ['AR0001']: # for testing - valid_things = get_valid_things(session) - n = len(valid_things) - for j, thing in enumerate(valid_things): - photos = well_photos[well_photos["PointID"] == thing.name] +from transfers.util import read_csv, filter_to_valid_point_ids +from transfers.well_transfer import WellChunkTransferer + + +class AssetTransferer(WellChunkTransferer): + def __init__(self, *args, **kw): + self.source_table = "WellPhotos" + super().__init__(*args, **kw) + self._client = get_storage_client() + self._bucket = get_storage_bucket(self._client) + logger.info(f"Using bucket {self._bucket.name}") + + def _get_dfs(self): + input_df = read_csv(self.source_table) + cleaned_df = filter_to_valid_point_ids(input_df) + return input_df, cleaned_df + + def _chunk_step(self, session, df, i, row, db_item): + photos = df[df["PointID"] == db_item.name] + n = len(df) if photos.empty: - photos = well_photos[well_photos["PointID"] == thing.name.replace("-", "")] + photos = df[df["PointID"] == db_item.name.replace("-", "")] if photos.empty: - logger.info(f"No photos found for PointID: {thing.name}") - continue + logger.info(f"No photos found for PointID: {db_item.name}") + 
return - for i, row in enumerate(photos.itertuples()): + for j, row in enumerate(photos.itertuples()): photo_path = row.OLEPath - srcblob = bucket.get_blob(f"nma-photos/{photo_path}") + srcblob = self._bucket.get_blob(f"nma-photos/{photo_path}") if not srcblob: logger.critical( - f"No photo found for PointID: {thing.name}, {photo_path}" + f"No photo found for PointID: {db_item.name}, {photo_path}" ) continue @@ -67,56 +63,25 @@ def transfer_assets(session: Session) -> None: f = srcblob.download_as_bytes() ff = UploadFile(file=io.BytesIO(f), filename=filename, size=len(f)) - uri, blob_name = gcs_upload(ff, bucket) - add_asset(session, ff, filename, thing.id, uri, blob_name) + uri, blob_name = gcs_upload(ff, self._bucket) + asset = Asset( + name=filename, + label=filename, + storage_path=blob_name, + storage_service="gcs", + mime_type="image/png", + size=ff.size, + uri=uri, + ) + assoc = AssetThingAssociation() + assoc.thing = db_item + assoc.asset = asset + session.add(assoc) + session.add(asset) + session.commit() logger.info( - f"Added asset {j}-{i}/{n} thing.id={thing.id} thing={thing.name} uri: {uri}" + f"Added asset {i}-{j}/{n} thing.id={db_item.id} thing={db_item.name} uri: {uri}" ) -def transfer_assets_testing(session: Session) -> None: - for p in ("asset1.png", "asset2.png", "asset3.png"): - with open(f"./transfers/data/assets/{p}", "rb") as f: - uf = UploadFile(file=f, filename=p, size=10) - uri, blob_name = gcs_upload(uf) - thing_id = 151 - - if check_asset_exists(session, blob_name, thing_id): - logger.warning(f"Asset {blob_name} already exists. 
Skipping.") - continue - add_asset(session, uf, p, thing_id, uri, blob_name) - - -def add_asset( - session: Session, - uf: UploadFile, - label: str, - thing_id: int, - uri: str, - blob_name: str, -) -> None: - asset = Asset( - name=label, - label=label, - storage_path=blob_name, - storage_service="gcs", - mime_type="image/png", - size=uf.size, - uri=uri, - ) - assoc = AssetThingAssociation() - audit_add({"sub": "foobar", "name": "Mr. Foobar"}, assoc) - thing = session.get(Thing, thing_id) - assoc.thing = thing - assoc.asset = asset - session.add(assoc) - session.add(asset) - session.commit() - - -if __name__ == "__main__": - - with session_ctx() as session: - transfer_assets(session) - # ============= EOF ============================================= diff --git a/transfers/group_transfer.py b/transfers/group_transfer.py index 0bad85cb7..5549a81d1 100644 --- a/transfers/group_transfer.py +++ b/transfers/group_transfer.py @@ -13,21 +13,28 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# =============================================================================== +import pandas as pd from sqlalchemy import select from sqlalchemy.orm import Session from db import Thing, Group, GroupThingAssociation -from db.engine import session_ctx -from transfers.util import read_csv -from transfers.logger import logger from services.util import retrieve_latest_polymorphic_history_table_record +from transfers.logger import logger +from transfers.transferer import Transferer +from transfers.util import read_csv + + +class ProjectGroupTransferer(Transferer): + def __init__(self, *args, **kw): + super().__init__(*args, **kw) + self.source_table = "Projects" + self.source_dtypes = {"Project": str, "PointIDPrefix": str} + def _get_dfs(self): + df = read_csv(self.source_table, self.source_dtypes) + return df, df -def transfer_groups( - session: Session, -) -> None: - wdf = read_csv("Projects") - for i, row in enumerate(wdf.itertuples()): + def _step(self, session: Session, df: pd.DataFrame, i: int, row: pd.Series): sql = select(Group).where(Group.name == row.Project) group = session.scalars(sql).one_or_none() @@ -79,7 +86,63 @@ def transfer_groups( session.commit() -if __name__ == "__main__": - with session_ctx() as session: - transfer_groups(session) +# def transfer_groups( +# session: Session, +# ) -> None: +# wdf = read_csv("Projects") +# for i, row in enumerate(wdf.itertuples()): +# +# sql = select(Group).where(Group.name == row.Project) +# group = session.scalars(sql).one_or_none() +# if not group: +# # add a group for each project +# group = Group(name=row.Project) +# +# for prefix in row.PointIDPrefix.split(","): +# prefix = prefix.strip() +# if prefix: +# # get all PointIDs that start with prefix +# sql = select(Thing).where(Thing.name.like(f"{prefix}%")) +# records = session.scalars(sql).unique().all() +# if records: +# logger.info( +# f"Adding {len(records)} things to group {group.name}, prefix {prefix}" +# ) +# group_is_monitoring_plan = False +# for 
record in records: +# # set the group_type to Monitoring Plan if at least one well is currently monitored +# if not group_is_monitoring_plan: +# if record.status_history: +# monitoring_status = [ +# sh +# for sh in record.status_history +# if sh.status_type == "Monitoring Status" +# ] +# if monitoring_status: +# monitoring_status = retrieve_latest_polymorphic_history_table_record( +# record, +# "status_history", +# "Monitoring Status", +# ) +# if ( +# monitoring_status.status_value +# == "Currently monitored" +# ): +# group_is_monitoring_plan = True +# group.group_type = "Monitoring Plan" +# logger.info( +# f" Setting group {group.name} type to Monitoring Plan based on thing {record.name}" +# ) +# +# gta = GroupThingAssociation(group=group, thing=record) +# session.add(gta) +# group.thing_associations.append(gta) +# +# session.add(group) +# session.commit() +# +# +# if __name__ == "__main__": +# with session_ctx() as session: +# transfer_groups(session) # ============= EOF ============================================= diff --git a/transfers/link_ids_transfer.py b/transfers/link_ids_transfer.py index f11f8bb97..dbb33f76f 100644 --- a/transfers/link_ids_transfer.py +++ b/transfers/link_ids_transfer.py @@ -24,184 +24,161 @@ extract_organization, read_csv, replace_nans, - chunk_by_size, ) +from transfers.well_transfer import WellChunkTransferer + + +class LinkIdsWellDataTransferer(WellChunkTransferer): + source_table = "WellData" + source_dtypes = {"OSEWellID": str, "OSEWelltagID": str} + + def _chunk_step(self, session, dr, i, row, db_item): + if pd.isna(row.OSEWellID) and pd.isna(row.OSEWelltagID): + return + + for aid, klass, regex in ( + (row.OSEWellID, "OSEPOD", r"^[A-Z]{1,3}-\d{3,6}"), + ( + row.OSEWelltagID, + "OSEWellTagID", + r"", + ), # TODO: need to figure out regex for this field + ): + if pd.isna(aid): + # logger.warning(f"{klass} is null for {row.PointID}") + continue + print("aid", aid, type(aid)) + # RULE: exclude any id that == 'X', '?' 
+ if aid.strip().lower() in ("x", "?", "exempt"): + logger.critical( + f'{klass} is "X", "?", or "exempt", id={aid} for {row.PointID}' + ) + continue + if regex and not re.match(regex, aid): + logger.critical( + f"{klass} id does not match regex {regex}, id={aid} for {row.PointID}" + ) + continue -def transfer_link_ids_welldata(session): - ldf = read_csv("WellData", dtype={"OSEWelltagID": str}) + # TODO: add guards for null values + link_id = ThingIdLink() + link_id.thing = db_item + link_id.relation = klass + link_id.alternate_id = aid + link_id.alternate_organization = "NMOSE" - ldf = filter_to_valid_point_ids(session, ldf) + # does link_id need a class e.g. + # link_id.alternate_id_class = klass - for chunk in chunk_by_size(ldf, 100): - things = ( - session.query(Thing).filter(Thing.name.in_(chunk.PointID.tolist())).all() - ) - for row in chunk.itertuples(): - # RULE: exclude rows where both ids are null - if pd.isna(row.OSEWellID) and pd.isna(row.OSEWelltagID): - # logger.warning( - # f"Both OSEWellID and OSEWelltagID are null for {row.PointID}" - # ) - continue + session.add(link_id) - thing = next((l for l in things if l.name == row.PointID), None) - if thing is None: - logger.warning( - f"Thing not found forPointID {row.PointID}. Skipping link ids." - ) - continue - for aid, klass, regex in ( - (row.OSEWellID, "OSEPOD", r"^[A-Z]{1,3}-\d{3,6}"), - ( - row.OSEWelltagID, - "OSEWellTagID", - r"", - ), # TODO: need to figure out regex for this field - ): - if pd.isna(aid): - # logger.warning(f"{klass} is null for {row.PointID}") - continue - - # RULE: exclude any id that == 'X', '?' 
- if aid.strip().lower() in ("x", "?", "exempt"): - logger.critical( - f'{klass} is "X", "?", or "exempt", id={aid} for {row.PointID}' - ) - continue - - if regex and not re.match(regex, aid): - logger.critical( - f"{klass} id does not match regex {regex}, id={aid} for {row.PointID}" - ) - continue - - # TODO: add guards for null values - link_id = ThingIdLink() - link_id.thing = thing - link_id.relation = klass - link_id.alternate_id = aid - link_id.alternate_organization = "NMOSE" - - # does link_id need a class e.g. - # link_id.alternate_id_class = klass - - session.add(link_id) - session.commit() - - -def add_link_alternate_site_id(session, row, thing): - if not row.AlternateSiteID: - return - - link_id = ThingIdLink() - link_id.thing = thing - link_id.relation = "same_as" - link_id.alternate_id = row.AlternateSiteID - - link_id.alternate_organization = extract_organization(str(row.AlternateSiteID)) - - # logger.info(f"adding link id: {row.PointID}") - session.add(link_id) - - -def add_link_site_id(session, row, thing): - if not row.SiteID: - return - - link_id = ThingIdLink() - link_id.thing = thing - link_id.relation = "same_as" - - site_id = row.SiteID.strip() - if not re.match(r"^\d{15}$", site_id): - # TODO: lets make a sweet function for flagging issues - # flag for interrogation - logger.critical( - f"{row.PointID} alternate id {site_id} is not a valid USGS site id" +class LinkIdsLocationDataTransferer(WellChunkTransferer): + source_table = "Location" + site_type = "GW" + + def __init__(self, *args, **kw): + super().__init__(*args, **kw) + + self._plss_regex = re.compile( + r"^T\d{1,3}[NS]\.R\d{1,3}[EW]\.S(?:[1-9]|[12]\d|3[0-6])(?:\.\d{1,5})?$" ) - return - - link_id.alternate_id = row.SiteID - link_id.alternate_organization = "USGS" - session.add(link_id) - - -def add_link_plss(session, row, thing): - - township = row.Township - township_direction = row.TownshipDirection - _range = row.Range - range_direction = row.RangeDirection - section = row.Section 
- section_direction = row.SectionDirection - - if not township or not _range or not section: - return - - link_id = ThingIdLink() - link_id.thing = thing - link_id.relation = "same_as" - link_id.alternate_organization = "PLSS" - - alternate_id = f"T{township}{township_direction}.R{_range}{range_direction}.S{section}{section_direction}" - if not re.match(r"T\d{1,3}.R\d{1,3}.S\d{1,3}", alternate_id): - # flag for interrogation - logger.warning(f"alternate id {alternate_id} is not a valid PLSS") - return - link_id.alternate_id = alternate_id - link_id.alternate_organization = "PLSS" - session.add(link_id) - - -def transfer_link_ids(session, site_type="GW"): - ldf = read_csv("Location") - ldf = ldf[ldf["SiteType"] == site_type] - ldf = ldf[ldf["Easting"].notna() & ldf["Northing"].notna()] - ldf = replace_nans(ldf) - - ldf = filter_to_valid_point_ids(session, ldf) - for chunk in chunk_by_size(ldf, 100): - locations = ( - session.query(Thing).filter(Thing.name.in_(chunk.PointID.tolist())).all() + self._usgs_regex = re.compile(r"^\d{15}$") + + def _get_dfs(self): + input_df = read_csv( + self.source_table, + { + "SiteID": str, + "Township": str, + "TownshipDirection": str, + "Range": str, + "RangeDirection": str, + "SectionQuarters": str, + }, ) - for row in chunk.itertuples(): - thing = next((l for l in locations if l.name == row.PointID), None) - if thing is None: - logger.warning( - f"Thing with PointID {row.PointID} not found. Skipping link id." 
- ) - continue - logger.info( - f"Processing PointID: {row.PointID}, Thing ID: {thing.id}, AlternateSiteID={row.AlternateSiteID}, " - f"AlternateSiteID2={row.AlternateSiteID2}" + + ldf = input_df[input_df["SiteType"] == self.site_type] + ldf = ldf[ldf["Easting"].notna() & ldf["Northing"].notna()] + ldf = replace_nans(ldf) + cleaned_df = filter_to_valid_point_ids(ldf) + return input_df, cleaned_df + + def _chunk_step(self, session, df, i, row, db_item): + logger.info( + f"Processing PointID: {row.PointID}, " + f"Thing ID: {db_item.id}, " + f"AlternateSiteID={row.AlternateSiteID}, " + f"AlternateSiteID2={row.AlternateSiteID2}" + ) + for func in ( + self._add_link_alternate_site_id, + self._add_link_site_id, + self._add_link_plss, + ): + link = func(row, db_item) + if link: + session.add(link) + + def _add_link_alternate_site_id(self, row: pd.Series, thing: Thing): + if not row.AlternateSiteID: + return + + return _make_thing_id_link( + thing, row.AlternateSiteID, extract_organization(str(row.AlternateSiteID)) + ) + + def _add_link_site_id(self, row, thing): + if not row.SiteID: + return + + site_id = row.SiteID.strip() + if not self._usgs_regex.match(site_id): + self._capture_error( + row.PointID, f"{site_id} is not a valid USGS site id", "SiteID" + ) + logger.critical( + f"{row.PointID} alternate id {site_id} is not a valid USGS site id" + ) + return + + return _make_thing_id_link(thing, row.SiteID, "USGS") + + def _add_link_plss(self, row, thing): + township = row.Township + township_direction = row.TownshipDirection + _range = row.Range + range_direction = row.RangeDirection + section = row.SectionQuarters + if not township or not _range or not section: + return + + alternate_id = ( + f"T{township}{township_direction}.R{_range}{range_direction}.S{section}" + ) + if not self._plss_regex.match(alternate_id): + self._capture_error( + row.PointID, + f"{alternate_id} is not a valid PLSS", + "Township, TownshipDirection, Range, RangeDirection, Section, 
SectionDirection", ) - add_link_alternate_site_id(session, row, thing) - session.commit() - - # for i, row in enumerate(ldf.itertuples()): - # thing = session.query(Thing).where(Thing.name == row.PointID).first() - # if thing is None: - # logger.warning( - # f"Thing with PointID {row.PointID} not found. Skipping link id." - # ) - # continue - # logger.info( - # f"Processing PointID: {row.PointID}, Thing ID: {thing.id}, AlternateSiteID={row.AlternateSiteID}, " - # f"AlternateSiteID2={row.AlternateSiteID2}" - # ) - # add_link_alternate_site_id(session, row, thing) - # # add_link_site_id(session, row, thing) - # # add_link_plss(session, row, thing) - # - # # not clear what alternate_id2 is for, or what it maps to - # # add_link_alternate_site_id2(session, row, thing) - # if i and not i % 25: - # session.commit() - # session.flush() - # - # session.commit() + + logger.critical(f"alternate id {alternate_id} is not a valid PLSS") + return + + return _make_thing_id_link(thing, alternate_id, "PLSS") + + +def _make_thing_id_link( + thing, alternate_id, alternate_organization, relation="same_as" +): + return ThingIdLink( + thing=thing, + relation=relation, + alternate_id=alternate_id, + alternate_organization=alternate_organization, + ) # ============= EOF ============================================= diff --git a/transfers/metrics.py b/transfers/metrics.py index 25b6b626b..1f2b67bdd 100644 --- a/transfers/metrics.py +++ b/transfers/metrics.py @@ -22,7 +22,6 @@ from pydantic import ValidationError from sqlalchemy import select, func from sqlalchemy.exc import ProgrammingError -from sqlalchemy.orm import Session from db import ( Thing, @@ -33,7 +32,10 @@ Parameter, Deployment, TransducerObservation, + Group, + Asset, ) +from db.engine import session_ctx from services.gcs_helper import get_storage_bucket @@ -77,9 +79,24 @@ def sensor_metrics(self, *args, **kw) -> None: def well_screen_metrics(self, *args, **kw) -> None: self._handle_metrics(WellScreen, *args, **kw) - def 
contact_metrics(self, sess, input_df, cleaned_df, errors) -> None: + def welldata_link_ids_metrics(self, input_df, cleaned_df, errors) -> None: + self._write_metrics("WellData Link IDs", len(input_df), input_df, cleaned_df) + self._write_errors(errors) + + def location_link_ids_metrics(self, input_df, cleaned_df, errors) -> None: + self._write_metrics( + "LocationData Link IDs", len(input_df), input_df, cleaned_df + ) + self._write_errors(errors) + + def asset_metrics(self, *args, **kw) -> None: + self._handle_metrics(Asset, *args, **kw) + + def group_metrics(self, *args, **kw) -> None: + self._handle_metrics(Group, *args, **kw) + + def contact_metrics(self, input_df, cleaned_df, errors) -> None: count = self._get_count( - sess, Contact, ) @@ -90,14 +107,15 @@ def contact_metrics(self, sess, input_df, cleaned_df, errors) -> None: self._writer.writerow(metrics) self._write_errors(errors) - def water_level_metrics(self, sess, input_df, cleaned_df, errors) -> None: - sql = ( - select(func.count()) - .select_from(Observation) - .join(Parameter) - .where(Parameter.parameter_name == "groundwater level") - ) - count = sess.execute(sql).scalar_one() + def water_level_metrics(self, input_df, cleaned_df, errors) -> None: + with session_ctx() as sess: + sql = ( + select(func.count()) + .select_from(Observation) + .join(Parameter) + .where(Parameter.parameter_name == "groundwater level") + ) + count = sess.execute(sql).scalar_one() metrics = self._make_metrics( "Manual Water Levels", len(input_df), len(cleaned_df), count @@ -111,19 +129,18 @@ def acoustic_metrics(self, *args, **kw) -> None: def pressure_metrics(self, *args, **kw) -> None: self._transducer_metrics("Pressure Transducer", *args, **kw) - def _transducer_metrics( - self, sensor_type, sess, input_df, cleaned_df, errors - ) -> None: - sql = ( - select(func.count()) - .select_from(TransducerObservation) - .join(Deployment) - .join(Sensor) - .join(Parameter) - .where(Sensor.sensor_type == sensor_type) - 
.where(Parameter.parameter_name == "groundwater level") - ) - count = sess.execute(sql).scalar_one() + def _transducer_metrics(self, sensor_type, input_df, cleaned_df, errors) -> None: + with session_ctx() as sess: + sql = ( + select(func.count()) + .select_from(TransducerObservation) + .join(Deployment) + .join(Sensor) + .join(Parameter) + .where(Sensor.sensor_type == sensor_type) + .where(Parameter.parameter_name == "groundwater level") + ) + count = sess.execute(sql).scalar_one() metrics = self._make_metrics(sensor_type, len(input_df), len(cleaned_df), count) self._writer.writerow(metrics) self._write_errors(errors) @@ -133,9 +150,9 @@ def _make_metrics(self, name, input_n, cleaned_n, count): return [name, input_n, cleaned_n, count, percent_issue] def _handle_metrics( - self, model, sess, input_df, cleaned_df, errors, where=None, name=None + self, model, input_df, cleaned_df, errors, where=None, name=None ) -> None: - count = self._get_count(sess, model, where=where) + count = self._get_count(model, where=where) if name is None: name = model.__name__ @@ -183,11 +200,12 @@ def _write_metrics( metrics = self._make_metrics(name, len(input_df), len(cleaned_df), count) self._writer.writerow(metrics) - def _get_count(self, sess: Session, model, where=None) -> int: - sql = select(func.count()).select_from(model) - if where: - sql = sql.where(where) - count = sess.execute(sql).scalar_one() + def _get_count(self, model, where=None) -> int: + with session_ctx() as sess: + sql = select(func.count()).select_from(model) + if where: + sql = sql.where(where) + count = sess.execute(sql).scalar_one() return count diff --git a/transfers/sensor_transfer.py b/transfers/sensor_transfer.py index 6c9a75cbc..76f9f4fe9 100644 --- a/transfers/sensor_transfer.py +++ b/transfers/sensor_transfer.py @@ -15,16 +15,18 @@ # =============================================================================== from datetime import datetime +import pandas as pd from sqlalchemy import select +from 
sqlalchemy.orm import Session -from db import Sensor, Deployment, Thing +from db import Sensor, Deployment, Thing, Base from transfers.transferer import ThingBasedTransferer from transfers.util import ( read_csv, logger, filter_to_valid_point_ids, replace_nans, - RecordingIntervalEstimator, + SensorParameterEstimator, ) EQUIPMENT_TO_SENSOR_TYPE_MAP = { @@ -42,11 +44,11 @@ def __init__(self, *args, **kwargs): self._estimators = {} self._added = {} - def _get_dfs(self, session): + def _get_dfs(self): input_df = read_csv(self.source_table) input_df.columns = input_df.columns.str.replace(" ", "_") input_df = input_df[input_df.SerialNo.notna()] - cleaned_df = filter_to_valid_point_ids(session, input_df) + cleaned_df = filter_to_valid_point_ids(input_df) cleaned_df = replace_nans(cleaned_df) return input_df, cleaned_df @@ -56,7 +58,15 @@ def _no_db_item_warning(self, index): def _get_prepped_group(self, group): return group.sort_values(by=["DateInstalled"]) - def _step(self, session, row, db_item): + def _get_estimator(self, sensor_type): + if sensor_type in self._estimators: + estimator = self._estimators[sensor_type] + else: + estimator = SensorParameterEstimator(sensor_type) + self._estimators[sensor_type] = estimator + return estimator + + def _group_step(self, session: Session, row: pd.Series, db_item: Base): pointid = self._get_point_id(row, db_item) try: @@ -66,14 +76,8 @@ def _step(self, session, row, db_item): f"Skipping equipment with type {row.EquipmentType} for point {pointid}" ) error = f"key error adding sensor_type:{row.EquipmentType} error: {e}" - self.errors.append( - { - "pointid": pointid, - "error": error, - "table": self.source_table, - "field": "EquipmentType", - } - ) + self._capture_error(pointid, error, "EquipmentType") + return if row.SerialNo in self._added: @@ -114,21 +118,29 @@ def _step(self, session, row, db_item): row.DateInstalled, "%Y-%m-%d %H:%M:%S.%f" ).date() else: - pointid = self._get_point_id(row) - logger.critical( - 
f"Installation Date cannot be None. Skipping deployment. Sensor: {row.ID}, " - f"SerialNo: {row.SerialNo} PointID: {pointid}" - ) - self.errors.append( - { - "pointid": pointid, - "error": f"row.ID={row.ID}, row.SerialNo={row.SerialNo}. Installation Date cannot " - f"be None", - "table": self.source_table, - "field": "DateInstalled", - } - ) - return + pointid = self._get_point_id(row, None) + estimator = self._get_estimator(sensor_type) + installation_date = estimator.estimate_installation_date(row) + if not installation_date: + logger.critical( + f"Installation Date cannot be None. Skipping deployment. Sensor: {row.ID}, " + f"SerialNo: {row.SerialNo} PointID: {pointid}" + ) + self._capture_error( + pointid, + f"row.SerialNo={row.SerialNo}. Installation Date cannot be None", + "DateInstalled", + ) + return + else: + logger.warning( + f"Estimated installation date={installation_date} for {pointid}" + ) + self._capture_error( + pointid, + f"Estimated installation date={installation_date}. Is this correct?", + "DateInstalled", + ) removal_date = None if row.DateRemoved: @@ -141,12 +153,7 @@ def _step(self, session, row, db_item): recording_interval = int(row.RecordingInterval) except (ValueError, TypeError): # try to calculate recording interval from measurements - if sensor_type in self._estimators: - estimator = self._estimators[sensor_type] - else: - estimator = RecordingIntervalEstimator(sensor_type) - self._estimators[sensor_type] = estimator - + estimator = self._get_estimator(sensor_type) recording_interval, unit, error = estimator.estimate_recording_interval( row, installation_date, removal_date ) @@ -157,18 +164,20 @@ def _step(self, session, row, db_item): f"name={sensor.name}, serial_no={sensor.serial_no}. " f"estimated recording interval: {recording_interval} {unit}" ) + self._capture_error( + pointid, + f"Estimated recording interval={recording_interval} {unit}. 
Is this correct?", + "RecordingInterval", + ) + else: logger.critical( f"name={sensor.name}, serial_no={sensor.serial_no} error={error}" ) - - self.errors.append( - { - "pointid": pointid, - "error": f"name={sensor.name}, row.SerialNo={row.SerialNo}. error={error}", - "table": self.source_table, - "field": "RecordingInterval", - } + self._capture_error( + pointid, + f"name={sensor.name}, row.SerialNo={row.SerialNo}. error={error}", + "RecordingInterval", ) sql = ( @@ -217,195 +226,4 @@ def _step(self, session, row, db_item): sensor.sensor_status = "Retired" -# def transfer_sensors(session): -# source_table = "Equipment" -# input_df = read_csv(source_table) -# input_df.columns = input_df.columns.str.replace(" ", "_") -# input_df = input_df[input_df.SerialNo.notna()] -# cleaned_df = filter_to_valid_point_ids(session, input_df) -# cleaned_df = replace_nans(cleaned_df) -# errors = [] -# grouped_equipment = cleaned_df.groupby(["PointID"]) -# added = {} -# estimators = {} -# for index, group in grouped_equipment: -# pointid = index[0] -# thing = session.query(Thing).filter(Thing.name == pointid).first() -# if thing is None: -# logger.warning( -# f"Skipping sensor transfer for Thing with PointID {pointid} since it is not in the DB" -# ) -# continue -# ordered_group = group.sort_values(by=["DateInstalled"]) -# -# try: -# for row in ordered_group.itertuples(): -# try: -# sensor_type = EQUIPMENT_TO_SENSOR_TYPE_MAP[row.EquipmentType] -# except KeyError as e: -# logger.critical( -# f"Skipping equipment with type {row.EquipmentType} for point {pointid}" -# ) -# error = ( -# f"key error adding sensor_type:{row.EquipmentType} error: {e}" -# ) -# errors.append( -# { -# "pointid": pointid, -# "error": error, -# "table": source_table, -# "field": "EquipmentType", -# } -# ) -# continue -# -# if row.SerialNo in added: -# logger.info( -# f"Sensor with serial number {row.SerialNo} already added in this transfer session. 
Only creating deployment for that record" -# ) -# sensor = added[row.SerialNo] -# else: -# sensor = ( -# session.query(Sensor) -# .filter(Sensor.serial_no == row.SerialNo) -# .one_or_none() -# ) -# if sensor: -# logger.info( -# f"Sensor with serial number {row.SerialNo} already exists. Only creating deployment for that record" -# ) -# -# if not sensor: -# # TODO: Add validation -# sensor = Sensor( -# nma_pk_equipment=row.GlobalID, -# name=row.ID, -# sensor_type=sensor_type, -# model=row.Model, -# serial_no=row.SerialNo, -# owner_agency="NMBGMR", -# notes=row.Equipment_Notes, -# ) -# added[row.SerialNo] = sensor -# session.add(sensor) -# logger.info( -# f"Added sensor {sensor.name} with serial number {sensor.serial_no}" -# ) -# -# if row.DateInstalled: -# installation_date = datetime.strptime( -# row.DateInstalled, "%Y-%m-%d %H:%M:%S.%f" -# ).date() -# else: -# logger.critical( -# f"Installation Date cannot be None. Skipping deployment. Sensor: {row.ID}, " -# f"SerialNo: {row.SerialNo} PointID: {pointid}" -# ) -# errors.append( -# { -# "pointid": pointid, -# "error": f"row.ID={row.ID}, row.SerialNo={row.SerialNo}. 
Installation Date cannot " -# f"be None", -# "table": source_table, -# "field": "DateInstalled", -# } -# ) -# continue -# -# removal_date = None -# if row.DateRemoved: -# removal_date = datetime.strptime( -# row.DateRemoved, "%Y-%m-%d %H:%M:%S.%f" -# ).date() -# -# recording_interval_unit = "hour" -# try: -# recording_interval = int(row.RecordingInterval) -# except (ValueError, TypeError): -# error = "RecordingInterval is not an integer" -# # try to calculate recording interval from measurements -# if sensor_type in estimators: -# estimator = estimators[sensor_type] -# else: -# estimator = RecordingIntervalEstimator(sensor_type) -# estimators[sensor_type] = estimator -# -# recording_interval, unit, error = ( -# estimator.estimate_recording_interval( -# row, installation_date, removal_date -# ) -# ) -# -# if recording_interval: -# recording_interval_unit = unit -# logger.info( -# f"name={sensor.name}, serial_no={sensor.serial_no}. " -# f"estimated recording interval: {recording_interval} {unit}" -# ) -# else: -# logger.critical( -# f"name={sensor.name}, serial_no={sensor.serial_no} error={error}" -# ) -# errors.append( -# { -# "pointid": pointid, -# "error": f"name={sensor.name}, row.SerialNo={row.SerialNo}. 
error={error}", -# "table": source_table, -# "field": "RecordingInterval", -# } -# ) -# sql = ( -# select(Deployment) -# .join(Thing) -# .join(Sensor) -# .where(Thing.name == pointid) -# .where(Sensor.serial_no == sensor.serial_no) -# .where(Deployment.installation_date == installation_date) -# .where(Deployment.removal_date == removal_date) -# ) -# -# existing_deployment = session.execute(sql).scalars().one_or_none() -# if existing_deployment: -# logger.info("existing deployment") -# continue -# -# # TODO: add validation -# deployment = Deployment( -# thing=thing, -# sensor=sensor, -# installation_date=installation_date, -# removal_date=removal_date, -# recording_interval=recording_interval, -# recording_interval_units=recording_interval_unit, -# hanging_cable_length=row.HangingCableLength, -# hanging_point_height=row.HangingPointHgt, -# hanging_point_description=row.HangingPointDescription, -# ) -# session.add(deployment) -# logger.info( -# f"Added deployment for sensor with serial number {sensor.serial_no}, deployed to {thing.name}: | Installation Date: {installation_date} | Removal Date: {removal_date}" -# ) -# -# """ -# Developer's notes -# -# Since it's unclear beforehand if a sensor has been removed just update -# the sensor_status based off of each deployments installation/removal -# dates -# """ -# if installation_date: -# sensor.sensor_status = "In Service" -# if removal_date: -# sensor.sensor_status = "Retired" -# session.commit() -# except Exception as e: -# import traceback -# -# traceback.print_exc() -# logger.critical(f"Could not add sensor and deployment: {e}") -# errors.append({"pointid": pointid, "error": e, "table": source_table}) -# -# return input_df, cleaned_df, errors - - # ============= EOF ============================================= diff --git a/transfers/transfer.py b/transfers/transfer.py index a2d7544a9..97086d10b 100644 --- a/transfers/transfer.py +++ b/transfers/transfer.py @@ -25,16 +25,18 @@ transfer_water_levels_acoustic, ) from 
core.initializers import erase_and_rebuild_db -from db.engine import session_ctx -from transfers.group_transfer import transfer_groups -from transfers.link_ids_transfer import transfer_link_ids, transfer_link_ids_welldata +from transfers.group_transfer import ProjectGroupTransferer +from transfers.link_ids_transfer import ( + LinkIdsWellDataTransferer, + LinkIdsLocationDataTransferer, +) from transfers.contact_transfer import transfer_contacts from transfers.sensor_transfer import SensorTransferer from transfers.waterlevels_transfer import transfer_water_levels from transfers.well_transfer import WellTransferer, WellScreenTransferer -from transfers.asset_transfer import transfer_assets +from transfers.asset_transfer import AssetTransferer from transfers.util import timeit, timeit_direct from transfers.logger import logger, save_log_to_bucket @@ -112,14 +114,18 @@ def transfer_all(sess, metrics, limit=100): or "Field duplicate") """ message("TRANSFERRING LINK IDS") - timeit_direct(transfer_link_ids, sess) - timeit_direct(transfer_link_ids_welldata, sess) + results = _execute_transfer(LinkIdsWellDataTransferer, flags=flags) + metrics.welldata_link_ids_metrics(*results) + results = _execute_transfer(LinkIdsLocationDataTransferer, flags=flags) + metrics.location_link_ids_metrics(*results) message("TRANSFERRING GROUPS") - timeit_direct(transfer_groups, sess) + results = _execute_transfer(ProjectGroupTransferer, flags=flags) + metrics.group_metrics(*results) message("TRANSFERRING ASSETS") - timeit_direct(transfer_assets, sess) + results = _execute_transfer(AssetTransferer, flags=flags) + metrics.asset_metrics(*results) def _execute_transfer(klass, flags: dict = None): @@ -128,28 +134,26 @@ def _execute_transfer(klass, flags: dict = None): return transferer.input_df, transferer.cleaned_df, transferer.errors -def transfer_debugging(sess, metrics, limit=100): +def transfer_debugging(metrics, limit=100): message("STARTING TRANSFER DEBUG", new_line_at_top=False) if 
int(os.environ.get("ERASE_AND_REBUILD", 0)): logger.info("Erase and rebuilding database") erase_and_rebuild_db() - message("TRANSFERRING WELLS") - flags = {"TRANSFER_ALL_WELLS": True, "LIMIT": limit} # not currently used + message("TRANSFERRING WELLS") results = _execute_transfer(WellTransferer, flags=flags) - metrics.well_metrics(sess, *results) + metrics.well_metrics(*results) message("TRANSFERRING WELL SCREENS") results = _execute_transfer(WellScreenTransferer, flags=flags) - metrics.well_screen_metrics(sess, *results) + metrics.well_screen_metrics(*results) message("TRANSFERRING SENSORS") results = _execute_transfer(SensorTransferer, flags=flags) - # results = timeit_direct(transfer_sensors, sess) - metrics.sensor_metrics(sess, *results) + metrics.sensor_metrics(*results) # Developer's notes all the metadata for these Things are not defined in the models/schemas yet' # message("TRANSFERRING SPRINGS") @@ -164,42 +168,46 @@ def transfer_debugging(sess, metrics, limit=100): # message("TRANSFERRING METEOROLOGICAL") # timeit_direct(transfer_met, sess, limit) - message("TRANSFERRING CONTACTS") - results = timeit_direct(transfer_contacts, sess) - metrics.contact_metrics(sess, *results) - - message("TRANSFERRING WATER LEVELS") - results = timeit_direct(transfer_water_levels, sess) - metrics.water_level_metrics(sess, *results) - - message("TRANSFERRING WATER LEVELS PRESSURE") - results = timeit_direct(transfer_water_levels_pressure, sess) - metrics.pressure_metrics(sess, *results) - - message("TRANSFERRING WATER LEVELS ACOUSTIC") - results = timeit_direct(transfer_water_levels_acoustic, sess) - metrics.acoustic_metrics(sess, *results) + # message("TRANSFERRING CONTACTS") + # results = timeit_direct(transfer_contacts, sess) + # metrics.contact_metrics(sess, *results) + # + # message("TRANSFERRING WATER LEVELS") + # results = timeit_direct(transfer_water_levels, sess) + # metrics.water_level_metrics(sess, *results) + # + # message("TRANSFERRING WATER LEVELS PRESSURE") + # 
results = timeit_direct(transfer_water_levels_pressure, sess) + # metrics.pressure_metrics(sess, *results) + # + # message("TRANSFERRING WATER LEVELS ACOUSTIC") + # results = timeit_direct(transfer_water_levels_acoustic, sess) + # metrics.acoustic_metrics(sess, *results) - # message("TRANSFERRING LINK IDS") - # timeit_direct(transfer_link_ids, sess) - # timeit_direct(transfer_link_ids_welldata, sess) + message("TRANSFERRING LINK IDS") + results = _execute_transfer(LinkIdsWellDataTransferer, flags=flags) + metrics.welldata_link_ids_metrics(*results) + results = _execute_transfer(LinkIdsLocationDataTransferer, flags=flags) + metrics.location_link_ids_metrics(*results) - # message("TRANSFERRING GROUPS") - # timeit_direct(transfer_groups, sess) + message("TRANSFERRING GROUPS") + results = _execute_transfer(ProjectGroupTransferer, flags=flags) + metrics.group_metrics(*results) - # message("TRANSFERRING ASSETS") - # timeit_direct(transfer_assets, sess) + message("TRANSFERRING ASSETS") + results = _execute_transfer(AssetTransferer, flags=flags) + metrics.asset_metrics(*results) def main(): message("START--------------------------------------") limit = int(os.getenv("TRANSFER_LIMIT", 1000)) metrics = Metrics() - with session_ctx() as sess: - if int(os.getenv("TRANSFER_DEBUG", 0)): - transfer_debugging(sess, metrics, limit=limit) - else: - transfer_all(sess, metrics, limit=limit) + + if int(os.getenv("TRANSFER_DEBUG", 0)): + transfer_debugging(metrics, limit=limit) + else: + transfer_all(metrics, limit=limit) metrics.close() metrics.save_to_storage_bucket() diff --git a/transfers/transferer.py b/transfers/transferer.py index 273462585..8d84e1170 100644 --- a/transfers/transferer.py +++ b/transfers/transferer.py @@ -19,28 +19,47 @@ from pandas import DataFrame from sqlalchemy.orm import Session -from db import Thing +from db import Thing, Base from db.engine import session_ctx from transfers.logger import logger from transfers.util import chunk_by_size +class 
ManualFixer(object): + pass + + class Transferer(object): input_df: pd.DataFrame = None cleaned_df: pd.DataFrame = None errors: list = None flags: dict = None + source_table: str = None def __init__(self, flags: dict = None): self.errors = [] self.flags = flags if flags else {} + self.manual_fixer = ManualFixer() def transfer(self): with session_ctx() as session: - self.input_df, self.cleaned_df = self._get_dfs(session) + self.input_df, self.cleaned_df = self._get_dfs() self._transfer_hook(session) session.commit() + def _capture_error(self, pointid, error, field, table=None): + if table is None: + table = self.source_table + + self.errors.append( + { + "pointid": pointid, + "error": error, + "table": table, + "field": field, + } + ) + def _transfer_hook(self, session: Session): self._limit_iterator(session, self.flags.get("LIMIT", 0)) @@ -68,18 +87,18 @@ def _limit_iterator(self, session: Session, limit: int, step: int = 25): session.rollback() continue - self._iterator(session, df, i, row) + self._step(session, df, i, row) session.commit() self._after_hook(session) - def _iterator(self, session: Session, df: pd.DataFrame, i: int, row: dict): + def _step(self, session: Session, df: pd.DataFrame, i: int, row: dict): raise NotImplementedError("Must implement _iterator method") def _after_hook(self, session: Session): pass - def _get_dfs(self, session: Session): + def _get_dfs(self): raise NotImplementedError("Must implement _get_dfs method") @@ -100,7 +119,7 @@ def _transfer_hook(self, session: Session): if not dbitem: self._missing_db_item_warning(row) continue - self._chunk_iterator(session, df, i, row, dbitem) + self._chunk_step(session, df, i, row, dbitem) # def chunk_transfer(self): # with session_ctx() as session: @@ -125,7 +144,7 @@ def _get_df_chunk(self, session, chunk): def _missing_db_item_warning(self, row): raise NotImplementedError("Must be implemented in subclass") - def _chunk_iterator(self, session, df, i, row, dbitem): + def _chunk_step(self, 
session, df, i, row, dbitem): raise NotImplementedError("Must be implemented in subclass") def _get_db_item(self, chunk, row): @@ -150,21 +169,19 @@ def _group_iterator(self, session: Session): prepped_group = self._get_prepped_group(group) for row in prepped_group.itertuples(): try: - self._step(session, row, db_item) + self._group_step(session, row, db_item) except Exception as e: import traceback pointid = self._get_point_id(row, db_item) traceback.print_exc() logger.critical(f"Could not add sensor and deployment: {e}") - self.errors.append( - {"pointid": pointid, "error": e, "table": self.source_table} - ) + self._capture_error(pointid, e, "UnknownField") - def _get_point_id(self, row, db_item) -> str: + def _get_point_id(self, row: pd.Series, db_item: Base) -> str: return row.PointID - def _step(self, session: Session, row, db_item): + def _group_step(self, session: Session, row: pd.Series, db_item: Base): raise NotImplementedError("Must be implemented in subclass") def _get_prepped_group(self, group) -> DataFrame: diff --git a/transfers/util.py b/transfers/util.py index 023d4a397..70e6952a5 100644 --- a/transfers/util.py +++ b/transfers/util.py @@ -26,10 +26,10 @@ import pytz from shapely import Point from sqlalchemy import select -from sqlalchemy.orm import Session from constants import SRID_WGS84, SRID_UTM_ZONE_13N from db import Thing, Location, DataProvenance +from db.engine import session_ctx from services.gcs_helper import get_storage_bucket # from services.lexicon_mapper import lexicon_mapper @@ -72,22 +72,26 @@ def estimate_measuring_point_height( logger.info( f"No MPHeight found for PointID: {row.PointID}. Estimating from measurements." 
) - # try to estimate mpheight from measurements mphs = [] start_dates = [] mph_descs = [] - for m in df.itertuples(): - mphi = m.DepthToWater - m.DepthToWaterBGS - start_date = m.DateMeasured - if mphi not in mphs: - mphs.append(mphi) - mph_descs.append( - "Auto calculated from measurements at depth to water and depth to water below ground surface" - ) - start_dates.append(start_date) - logger.info( - f"Estimated MPHeight: {mphs}, {start_dates} for PointID: {row.PointID}." - ) + + if len(df) == 0: + logger.warning(f"No measurements found for PointID: {row.PointID}.") + else: + # try to estimate mpheight from measurements + for m in df.itertuples(): + mphi = m.DepthToWater - m.DepthToWaterBGS + start_date = m.DateMeasured + if mphi not in mphs: + mphs.append(mphi) + mph_descs.append( + "Auto calculated from measurements at depth to water and depth to water below ground surface" + ) + start_dates.append(start_date) + logger.info( + f"Estimated MPHeight: {mphs}, {start_dates} for PointID: {row.PointID}." + ) else: mphs = [mph] mph_descs = [mph_desc] @@ -105,7 +109,7 @@ def estimate_measuring_point_height( return zip(mphs, mph_descs, start_dates, end_dates) -class RecordingIntervalEstimator: +class SensorParameterEstimator: def __init__(self, sensor_type: str): if sensor_type == "Pressure Transducer": self._df = read_csv("WaterLevelsContinuous_Pressure") @@ -115,6 +119,23 @@ def __init__(self, sensor_type: str): # convert "DateMeasured" to date" self._df["DateMeasured"] = pd.to_datetime(self._df["DateMeasured"]).dt.date + def estimate_installation_date( + self, record: pd.Series + ) -> tuple[datetime | None, str | None]: + # get the first measurement for this pointid + point_id = record.PointID + cdf = self._get_values(point_id) + if len(cdf) == 0: + logger.warning( + f"Unable to estimate installation date, no measurements found for PointID: {point_id}." 
+ ) + return None + return cdf["DateMeasured"].min() + + def _get_values(self, point_id: str): + cdf = self._df[self._df["PointID"] == point_id] + return cdf.sort_values("DateMeasured") + def estimate_recording_interval( self, record: pd.Series, @@ -122,12 +143,10 @@ def estimate_recording_interval( removal_date: datetime = None, ) -> tuple[int | None, str | None, str | None]: point_id = record.PointID - - cdf = self._df[self._df["PointID"] == point_id] + cdf = self._get_values(point_id) if len(cdf) == 0: return None, None, f"No measurements found for PointID: {point_id}" - cdf = cdf.sort_values("DateMeasured") if installation_date is not None: cdf = cdf[cdf["DateMeasured"] >= installation_date] if removal_date is not None: @@ -203,9 +222,10 @@ def read_csv(name: str, dtype: dict | None = None) -> pd.DataFrame: return pd.read_csv(io.BytesIO(data)) -def get_valid_point_ids(session, thing_type="water well"): - things = get_valid_things(session, thing_type) - valid_pointids = [thing.name for thing in things] +def get_valid_point_ids(thing_type="water well"): + with session_ctx() as session: + things = get_valid_things(session, thing_type) + valid_pointids = [thing.name for thing in things] return valid_pointids @@ -243,9 +263,10 @@ def data_path(r): return root / name -def filter_non_transferred_wells(sess: Session, df: pd.DataFrame) -> pd.DataFrame: - sql = select(Thing.name).where(Thing.thing_type == "water well") - existing_ids = sess.execute(sql).scalars().all() +def filter_non_transferred_wells(df: pd.DataFrame) -> pd.DataFrame: + with session_ctx() as sess: + sql = select(Thing.name).where(Thing.thing_type == "water well") + existing_ids = sess.execute(sql).scalars().all() return df[~(df["PointID"].isin(existing_ids))] @@ -265,7 +286,7 @@ def filter_by_welldata_datasource_and_project(df: pd.DataFrame) -> pd.DataFrame: counts = df.groupby("DataSource").size().reset_index(name="WellCount") counts = counts.sort_values("WellCount", ascending=False) for count in 
counts.itertuples(): - logger.info(f"{count.DataSource}: {count.WellCount}") + logger.info(f"{count.WellCount}: {count.DataSource[:50]} ") pldf = read_csv("ProjectLocations") collabnet = pldf[pldf["ProjectName"] == "Water Level Network"] @@ -288,8 +309,8 @@ def filter_by_valid_measuring_agency(df: pd.DataFrame) -> pd.DataFrame: return df[df["MeasuringAgency"].isin(valid_measuring_agencies)] -def filter_to_valid_point_ids(session: Session, df: pd.DataFrame) -> pd.DataFrame: - valid_point_ids = get_valid_point_ids(session) +def filter_to_valid_point_ids(df: pd.DataFrame) -> pd.DataFrame: + valid_point_ids = get_valid_point_ids() return df[df["PointID"].isin(valid_point_ids)] diff --git a/transfers/well_transfer.py b/transfers/well_transfer.py index cc049876d..45a867a72 100644 --- a/transfers/well_transfer.py +++ b/transfers/well_transfer.py @@ -117,9 +117,7 @@ def _extract_casing_materials(row) -> list[str]: return materials -def get_wells_to_transfer( - sess: Session, flags: dict = None -) -> tuple[pd.DataFrame, pd.DataFrame]: +def get_wells_to_transfer(flags: dict = None) -> tuple[pd.DataFrame, pd.DataFrame]: # if flags is None: # flags = {} @@ -145,7 +143,7 @@ def get_wells_to_transfer( # cleaned_df = wdf cleaned_df = filter_by_welldata_datasource_and_project(wdf) - cleaned_df = filter_non_transferred_wells(sess, cleaned_df) + cleaned_df = filter_non_transferred_wells(cleaned_df) return input_df, cleaned_df @@ -176,23 +174,16 @@ def __init__(self, *args, **kw): self._cached_elevations = get_cached_elevations() self._added_locations = {} - def _get_dfs(self, session: Session): - return get_wells_to_transfer(session, self.flags) + def _get_dfs(self): + return get_wells_to_transfer(self.flags) - def _iterator(self, session, df, i, row): + def _step(self, session: Session, df: pd.DataFrame, i: int, row: pd.Series): pointid = row.PointID if df[df["PointID"] == pointid].shape[0] > 1: logger.critical( f"transfer_wells. PointID {pointid} has duplicate records. Skipping." 
) - self.errors.append( - { - "pointid": pointid, - "error": "duplicate records", - "table": self.source_table, - "field": "PointID", - } - ) + self._capture_error(pointid, "duplicate records", "PointID") return location = None @@ -203,16 +194,8 @@ def _iterator(self, session, df, i, row): except Exception as e: if location is not None: session.expunge(location) - # these rollbacks are cause an issue because they are discarding good data - # session.rollback() - self.errors.append( - { - "pointid": row.PointID, - "error": e, - "table": "Location", - "field": str(e), - } - ) + + self._capture_error(row.PointID, str(e), str(e), "Location") logger.critical(f"Error making location for {row.PointID}: {e}") return @@ -249,9 +232,7 @@ def _iterator(self, session, df, i, row): CreateWell.model_validate(data) except ValidationError as e: - self.errors.append( - {"pointid": row.PointID, "error": e, "table": "WellData"} - ) + self._capture_error(row.PointID, str(e), "UnknownField") logger.critical( f"Validation error for row {i} with PointID {row.PointID}: {e.errors()}" ) @@ -310,9 +291,8 @@ def _iterator(self, session, df, i, row): if well is not None: session.expunge(well) - self.errors.append( - {"pointid": row.PointID, "error": e, "table": "WellData"} - ) + self._capture_error(row.PointID, str(e), "UnknownField") + logger.critical(f"Error creating well for {row.PointID}: {e}") return @@ -418,11 +398,22 @@ def _after_hook(self, session): session.commit() -class WellScreenTransferer(ChunkTransferer): - def _get_dfs(self, session: Session): - input_df = read_csv("WellScreens") +class WellChunkTransferer(ChunkTransferer): + source_table: str = None + source_dtypes: dict = None + + def __init__(self, *args, **kw): + super().__init__(*args, **kw) + if self.source_table is None: + raise ValueError("source_table must be set") + + def _get_dfs(self): + if self.source_table is None: + raise ValueError("source_table must be set") + + input_df = read_csv(self.source_table, 
self.source_dtypes) wdf = replace_nans(input_df) - cleaned_df = filter_to_valid_point_ids(session, wdf) + cleaned_df = filter_to_valid_point_ids(wdf) return input_df, cleaned_df def _get_df_chunk(self, session, chunk): @@ -437,7 +428,11 @@ def _get_db_item(self, dbchunk, row): def _missing_db_item_warning(self, row): logger.warning(f"Thing with PointID {row.PointID} not found in database.") - def _chunk_iterator(self, session, df, i, row, db_item): + +class WellScreenTransferer(WellChunkTransferer): + source_table = "WellScreens" + + def _chunk_step(self, session, df, i, row, db_item): well_screen_data = { "thing_id": db_item.id, "screen_depth_top": row.ScreenTop, @@ -454,9 +449,7 @@ def _chunk_iterator(self, session, df, i, row, db_item): logger.critical( f"Validation error for row {i} with PointID {row.PointID}: {e.errors()}" ) - self.errors.append( - {"pointid": row.PointID, "error": e, "table": "WellScreens"} - ) + self._capture_error(row.PointID, str(e), "UnknownField") return well_screen = WellScreen(**well_screen_data) From 078493c3763aa8361fc61cf2fb6176432040018c Mon Sep 17 00:00:00 2001 From: jakeross Date: Fri, 28 Nov 2025 20:39:59 -0700 Subject: [PATCH 23/66] refactor: replace transfer_water_levels function with WaterLevelTransferer class for improved data handling and transfer process --- transfers/link_ids_transfer.py | 2 +- transfers/transfer.py | 12 +- transfers/waterlevels_transfer.py | 541 +++++++++++++----------------- 3 files changed, 242 insertions(+), 313 deletions(-) diff --git a/transfers/link_ids_transfer.py b/transfers/link_ids_transfer.py index dbb33f76f..c32fd0b8d 100644 --- a/transfers/link_ids_transfer.py +++ b/transfers/link_ids_transfer.py @@ -47,7 +47,7 @@ def _chunk_step(self, session, dr, i, row, db_item): if pd.isna(aid): # logger.warning(f"{klass} is null for {row.PointID}") continue - print("aid", aid, type(aid)) + # RULE: exclude any id that == 'X', '?' 
if aid.strip().lower() in ("x", "?", "exempt"): logger.critical( diff --git a/transfers/transfer.py b/transfers/transfer.py index 97086d10b..5cfc6e63d 100644 --- a/transfers/transfer.py +++ b/transfers/transfer.py @@ -33,7 +33,7 @@ ) from transfers.contact_transfer import transfer_contacts from transfers.sensor_transfer import SensorTransferer -from transfers.waterlevels_transfer import transfer_water_levels +from transfers.waterlevels_transfer import WaterLevelTransferer from transfers.well_transfer import WellTransferer, WellScreenTransferer from transfers.asset_transfer import AssetTransferer @@ -92,8 +92,8 @@ def transfer_all(sess, metrics, limit=100): metrics.contact_metrics(sess, *results) message("TRANSFERRING WATER LEVELS") - results = timeit_direct(transfer_water_levels, sess) - metrics.water_level_metrics(sess, *results) + results = _execute_transfer(WaterLevelTransferer, flags=flags) + metrics.water_level_metrics(*results) message("TRANSFERRING WATER LEVELS PRESSURE") results = timeit_direct(transfer_water_levels_pressure, sess) @@ -172,9 +172,9 @@ def transfer_debugging(metrics, limit=100): # results = timeit_direct(transfer_contacts, sess) # metrics.contact_metrics(sess, *results) # - # message("TRANSFERRING WATER LEVELS") - # results = timeit_direct(transfer_water_levels, sess) - # metrics.water_level_metrics(sess, *results) + message("TRANSFERRING WATER LEVELS") + results = _execute_transfer(WaterLevelTransferer, flags=flags) + metrics.water_level_metrics(*results) # # message("TRANSFERRING WATER LEVELS PRESSURE") # results = timeit_direct(transfer_water_levels_pressure, sess) diff --git a/transfers/waterlevels_transfer.py b/transfers/waterlevels_transfer.py index a1bb32717..80b8a4bd8 100644 --- a/transfers/waterlevels_transfer.py +++ b/transfers/waterlevels_transfer.py @@ -14,11 +14,11 @@ # limitations under the License. 
# =============================================================================== import json -import time import uuid from datetime import datetime import pandas as pd +from sqlalchemy.orm import Session from db import ( Thing, @@ -30,6 +30,8 @@ FieldEventParticipant, Parameter, ) +from db.engine import session_ctx +from transfers.transferer import Transferer from transfers.util import ( filter_to_valid_point_ids, logger, @@ -46,348 +48,275 @@ SPACE_6 = " " * 6 -def get_dt_utc(row, errors): - if pd.isna(row.DateMeasured): - logger.critical( - f"transfer_water_levels. Skipping row PointID={row.PointID}, objectid={row.OBJECTID} because there is no DateMeasured" - ) - errors.append( - { - "pointid": row.PointID, - "error": "no DateMeasured", - "table": "WaterLevels", - "field": "DateMeasured", - } - ) - return - - if pd.isna(row.TimeMeasured): - fmt = "%Y-%m-%d" - dt_measured = row.DateMeasured - else: - fmt = "%Y-%m-%d %H:%M:%S.%f" - t = row.TimeMeasured - # Truncate microseconds to 6 digits if present - if "." in t: - t = t[:-6] - - dt_measured = f"{row.DateMeasured} {t}" - - try: - dt = datetime.strptime(dt_measured, fmt) - return convert_mt_to_utc(dt) - except ValueError as e: - errors.append( - { - "pointid": row.PointID, - "error": str(e), - "table": "WaterLevels", - "field": "DateMeasured", - } - ) - logger.critical( - f"transfer_water_levels. 
Skipping row PointID={row.PointID}, objectid={row.OBJECTID} due to " - f"invalid date/time: {e}" - ) - return None - - -def get_contacts_info(row, measured_by, measured_by_mapper): - # measuring_agency = ( - # "Unknown" if pd.isna(row.MeasuringAgency) else row.MeasuringAgency - # ) - - # ns --> names - # os --> organizations - # rs --> roles +def get_contacts_info( + row, measured_by, measured_by_mapper +) -> list[tuple[str, str, str]]: # TODO: get help figuring out (AMP) if measured_by in measured_by_mapper: args = measured_by_mapper[measured_by] if isinstance(args[0], list): - ns, os, rs = zip(*args) + names, orgs, roles = zip(*args) else: - ns = [args[0]] - os = [args[1]] - rs = [args[2]] + names, orgs, roles = [args[0]], [args[1]], [args[2]] + else: - ns = [measured_by] - os = ["Unknown"] - rs = ["Unknown"] + names = [measured_by] + orgs = ["Unknown"] + roles = ["Unknown"] logger.warning( f"{SPACE_6}The following record has not been mapped to a Contact: MeasuredBy {row.MeasuredBy} | MeasuringAgency {row.MeasuringAgency} for WaterLevels record with GLobalID {row.GlobalID}" ) - return ns, os, rs - - -def transfer_water_levels(session): - groundwater_parameter_id = ( - session.query(Parameter) - .filter(Parameter.parameter_name == "groundwater level") - .one() - .id - ) - - # keep a dictionary of created Contacts to avoid repeated SQL queries - # keys are a tuple of (name, organization) since None is a common "name" - created_contacts = {} - path = get_transfers_data_path("measured_by_mapper.json") - - with open(path, "r") as f: - measured_by_mapper = json.load(f) - source_table = "WaterLevels" - input_df = read_csv(source_table) - cleaned_df = filter_to_valid_point_ids(session, input_df) - cleaned_df = filter_by_valid_measuring_agency(cleaned_df) - - gwd = cleaned_df.groupby(["PointID"]) - - start_time = time.time() - errors = [] - - # TODO: this needs to be cleaned up - # the for loop is too long and hard to read - # adding contacts should be done in a separate 
function - for index, group in gwd: - pointid = index[0] - logger.info(f"Processing PointID: {pointid}") - thing = session.query(Thing).where(Thing.name == pointid).first() - if thing is None: - logger.critical( - f"Thing with PointID={pointid} not found. Skipping water levels" - ) - errors.append( - { - "pointid": pointid, - "error": "Thing with PointID not found", - "table": source_table, - "field": "PointID", - } - ) - continue - - n = len(group) - for i, row in enumerate(group.itertuples()): - if i and not i % 25: - logger.info( - f"Processing row {i} of {n}. {row.PointID}, avg rows per second: {i / (time.time() - start_time):.2f}" - ) - session.commit() - - dt_utc = get_dt_utc(row, errors) - if dt_utc is None: - continue - - release_status = "public" if row.PublicRelease else "private" - - measured_by = None if pd.isna(row.MeasuredBy) else row.MeasuredBy - - """ - Developer's notes + return zip(names, orgs, roles) - Use existing contact for the thing if measured by is the owner. - If no contacts can be made or retrieved for the field event skip - it altogether and note in the log file. 
There must be at least one - contact associated with an event - """ - field_event_participants = [] - if measured_by not in ["Owner", "Owner report", "Well owner"]: - # --- Contact/FieldEventParticipant --- - contact_info = get_contacts_info(row, measured_by, measured_by_mapper) - - for name, organization, role in zip(*contact_info): - if (name, organization) in created_contacts: - contact = created_contacts[(name, organization)] - else: - try: - # create new contact if not already created - contact = Contact( - name=name, - role=role, - contact_type="Field Event Participant", - organization=organization, - nma_pk_waterlevels=row.GlobalID, - ) - session.add(contact) - # session.flush() # to get the contact.id - - logger.info( - f"{SPACE_2}Created contact: | Name {contact.name} | Role {contact.role} | Organization {contact.organization} | nma_pk_waterlevels {contact.nma_pk_waterlevels}" - ) - - created_contacts[(name, organization)] = contact - except Exception as e: - logger.critical( - f"Contact cannot be created: Name {name} | Role {role} | Organization {organization} because of the following: {str(e)}" - ) - continue - - field_event_participants.append(contact) - else: - contact = thing.contacts[0] - field_event_participants.append(contact) - - if len(field_event_participants) == 0: - logger.critical( - f"No contacts can be associated with the WaterLevels record with GlobalID {row.GlobalID}, therefore no field event, field activity, sample, and observation can be made. Skipping." 
+class WaterLevelTransferer(Transferer): + def __init__(self, *args, **kw): + super().__init__(*args, **kw) + self.source_table = "WaterLevels" + with session_ctx() as session: + groundwater_parameter_id = ( + session.query(Parameter) + .filter(Parameter.parameter_name == "groundwater level") + .one() + .id + ) + self.groundwater_parameter_id = groundwater_parameter_id + + path = get_transfers_data_path("measured_by_mapper.json") + with open(path, "r") as f: + self._measured_by_mapper = json.load(f) + + self._created_contacts = {} + + def _get_dfs(self) -> tuple[pd.DataFrame, pd.DataFrame]: + input_df = read_csv(self.source_table) + cleaned_df = filter_to_valid_point_ids(input_df) + cleaned_df = filter_by_valid_measuring_agency(cleaned_df) + return input_df, cleaned_df + + def _transfer_hook(self, session: Session) -> None: + gwd = self.cleaned_df.groupby(["PointID"]) + for index, group in gwd: + pointid = index[0] + thing = session.query(Thing).where(Thing.name == pointid).first() + + for i, row in enumerate(group.itertuples()): + dt_utc = self._get_dt_utc(row) + if dt_utc is None: + continue + + release_status = "public" if row.PublicRelease else "private" + + # field event + field_event = FieldEvent( + thing=thing, + event_date=dt_utc, + release_status=release_status, ) - continue - - """ - Developer's notes + session.add(field_event) + field_event_participants = self._get_field_event_participants( + session, row, thing + ) + sampler = None + for i, participant in enumerate(field_event_participants): + field_event_participant = FieldEventParticipant( + field_event=field_event, participant=participant + ) + if i == 0: + field_event_participant.participant_role = "Lead" + sampler = field_event_participant + else: + field_event_participant.participant_role = "Participant" - Assumes for manual water levels that the date/time of the water level - measurement is the same as the date/time of the field event. 
- """ + session.add(field_event_participant) - # --- FieldEvent --- - # TODO: use create schema to validate data - field_event = FieldEvent( - thing=thing, - event_date=dt_utc, - release_status=release_status, - ) + # reasons + glv = self._get_groundwater_level_reason(row) + if ( + glv + == "Well was destroyed (no subsequent water levels should be recorded)" + ): + logger.warning( + "Well is destroyed - no field activity/sample/observation will be made" + ) + field_event.notes = glv + continue + + # Field Activity + # TODO: use create schema to validate data + field_activity = FieldActivity( + field_event=field_event, + activity_type="groundwater level", + release_status=release_status, + ) + session.add(field_activity) - session.add(field_event) + # Sample + sample = self._make_sample(row, field_activity, dt_utc, sampler) + session.add(sample) - logger.info( - f"{SPACE_2}Created field event: ID {field_event.id} | Date {field_event.event_date} | Thing ID {field_event.thing.id} | Thing Name {field_event.thing.name}" - ) + # Observation + observation = self._make_observation(row, sample, dt_utc, glv) + session.add(observation) - """ - Developer's notes + session.commit() - Assumes that the first listed contact is the lead and the - person who took the sample. 
The subsequent contact will be - participants in the field event - """ - for i, participant in enumerate(field_event_participants): - field_event_participant = FieldEventParticipant( - field_event=field_event, participant=participant + def _make_observation( + self, row: pd.Series, sample: Sample, dt_utc: datetime, glv: str + ) -> Observation: + if pd.isna(row.MPHeight): + if pd.notna(row.DepthToWater) and pd.notna(row.DepthToWaterBGS): + logger.warning( + f"{SPACE_6}Calculating measuring_point_height as DepthToWater - DepthToWaterBGS because MPHeight is NULL" ) - if i == 0: - field_event_participant.participant_role = "Lead" - sampler = field_event_participant - else: - field_event_participant.participant_role = "Participant" - - session.add(field_event_participant) - logger.info( - f"{SPACE_4}Created field event contact: ID {field_event_participant.id} | Role {field_event_participant.participant_role} | Contact ID {field_event_participant.participant.id} | Contact Name {field_event_participant.participant.name} | Contact Org {field_event_participant.participant.organization}" + measuring_point_height = row.DepthToWater - row.DepthToWaterBGS + else: + logger.warning( + f"{SPACE_6}Setting measuring_point_height to None because MPHeight is NULL and DepthToWater or DepthToWaterBGS is NULL" ) + measuring_point_height = None + else: + # some mp heights are recorded as negative numbers, but they should be positive + measuring_point_height = abs(row.MPHeight) - groundwater_level_reason = ( - lexicon_mapper.map_value(f"LU_LevelStatus:{row.LevelStatus}") - if not pd.isna(row.LevelStatus) - else None - ) - groundwater_level_reason = ( - "Water level not affected" - if groundwater_level_reason == "Water level not affected by status" - else groundwater_level_reason - ) - - if ( - groundwater_level_reason - == "Well was destroyed (no subsequent water levels should be recorded)" - ): + if pd.isna(row.DepthToWater): + if pd.notna(row.DepthToWaterBGS): logger.warning( - "Well is 
destroyed - no field activity/sample/observation will be made" + f"{SPACE_6}Calculating observation value as DepthToWaterBGS + MPHeight (0 if MPHeight is NULL) because DepthToWater is NULL" ) - field_event.notes = groundwater_level_reason - continue - - # --- FieldActivity --- - # TODO: use create schema to validate data - field_activity = FieldActivity( - field_event=field_event, - activity_type="groundwater level", - release_status=release_status, - ) - session.add(field_activity) + value = row.DepthToWaterBGS + measuring_point_height + else: + # use None not NaN + value = None + else: + value = row.DepthToWater - logger.info( - f"{SPACE_4}Created field activity: ID {field_activity.id} | Type {field_activity.activity_type}" + # TODO: after sensors have been added to the database update sensor_id (or sensor) for waterlevels that come from db sensors (like e probes?) + observation = Observation( + nma_pk_waterlevels=row.GlobalID, + sample=sample, + sensor_id=None, + analysis_method_id=None, + observation_datetime=dt_utc, + parameter_id=self.groundwater_parameter_id, + value=value, + unit="ft", + measuring_point_height=measuring_point_height, + groundwater_level_reason=glv, + ) + return observation + + def _make_sample(self, row, field_activity, dt_utc, sampler) -> Sample: + sample_method = ( + "null placeholder" + if pd.isna(row.MeasurementMethod) + else lexicon_mapper.map_value( + f"LU_MeasurementMethod:{row.MeasurementMethod}" ) + ) + sample = Sample( + nma_pk_waterlevels=row.GlobalID, + field_activity=field_activity, + field_event_participant=sampler, + sample_date=dt_utc, + sample_matrix="water", + sample_name=str(uuid.uuid4()), + sample_method=sample_method, + qc_type="Normal", + depth_top=None, + depth_bottom=None, + ) + return sample - # --- Sample --- - sample_method = ( - "null placeholder" - if pd.isna(row.MeasurementMethod) - else lexicon_mapper.map_value( - f"LU_MeasurementMethod:{row.MeasurementMethod}" - ) - ) + def _get_groundwater_level_reason(self, 
row) -> str: + glv = row.LevelStatus + if pd.isna(glv): + return None - # todo: use create schema to validate data - sample = Sample( - nma_pk_waterlevels=row.GlobalID, - field_activity=field_activity, - field_event_participant=sampler, - sample_date=dt_utc, - sample_matrix="water", - sample_name=str(uuid.uuid4()), - sample_method=sample_method, - qc_type="Normal", - depth_top=None, - depth_bottom=None, - ) - session.add(sample) - logger.info( - f"{SPACE_4}Created sample: ID {sample.id} | Date {sample.sample_date} | Matrix {sample.sample_matrix} | Method {sample.sample_method}" - ) + glv = lexicon_mapper.map_value(f"LU_LevelStatus:{glv}") + if glv == "Water level not affected by status": + glv = "Water level not affected" + return glv - # TODO: use create schema to validate data + def _get_field_event_participants(self, session, row, thing) -> list[Contact]: + field_event_participants = [] + measured_by = None if pd.isna(row.MeasuredBy) else row.MeasuredBy - if pd.isna(row.MPHeight): - if not pd.isna(row.DepthToWater) and not pd.isna(row.DepthToWaterBGS): - logger.warning( - f"{SPACE_6}Calculating measuring_point_height as DepthToWater - DepthToWaterBGS because MPHeight is NULL" - ) - measuring_point_height = row.DepthToWater - row.DepthToWaterBGS - else: - logger.warning( - f"{SPACE_6}Setting measuring_point_height to None because MPHeight is NULL and DepthToWater or DepthToWaterBGS is NULL" - ) - measuring_point_height = None - else: - # some mp heights are recorded as negative numbers, but they should be positive - measuring_point_height = abs(row.MPHeight) + if measured_by not in ["Owner", "Owner report", "Well owner"]: + # --- Contact/FieldEventParticipant --- + contact_info = get_contacts_info(row, measured_by, self._measured_by_mapper) - if pd.isna(row.DepthToWater): - if not pd.isna(row.DepthToWaterBGS): - logger.warning( - f"{SPACE_6}Calculating observation value as DepthToWaterBGS + MPHeight (0 if MPHeight is NULL) because DepthToWater is NULL" - ) - 
value = row.DepthToWaterBGS + measuring_point_height + for name, organization, role in contact_info: + if (name, organization) in self._created_contacts: + contact = self._created_contacts[(name, organization)] else: - # use None not NaN - value = None - else: - value = row.DepthToWater + try: + # create new contact if not already created + contact = Contact( + name=name, + role=role, + contact_type="Field Event Participant", + organization=organization, + nma_pk_waterlevels=row.GlobalID, + ) + session.add(contact) + + logger.info( + f"{SPACE_2}Created contact: | Name {contact.name} | Role {contact.role} | Organization {contact.organization} | nma_pk_waterlevels {contact.nma_pk_waterlevels}" + ) + + self._created_contacts[(name, organization)] = contact + except Exception as e: + logger.critical( + f"Contact cannot be created: Name {name} | Role {role} | Organization {organization} because of the following: {str(e)}" + ) + continue - # TODO: after sensors have been added to the database update sensor_id (or sensor) for waterlevels that come from db sensors (like e probes?) - observation = Observation( - nma_pk_waterlevels=row.GlobalID, - sample=sample, - sensor_id=None, - analysis_method_id=None, - observation_datetime=dt_utc, - parameter_id=groundwater_parameter_id, - value=value, - unit="ft", - measuring_point_height=measuring_point_height, - groundwater_level_reason=groundwater_level_reason, + field_event_participants.append(contact) + else: + contact = thing.contacts[0] + field_event_participants.append(contact) + + if len(field_event_participants) == 0: + logger.critical( + f"No contacts can be associated with the WaterLevels record with GlobalID {row.GlobalID}, therefore no field event, field activity, sample, and observation can be made. Skipping." 
) - session.add(observation) - logger.info( - f"{SPACE_4}Created observation: ID {observation.id} | DT {observation.observation_datetime} | Value {observation.value} | MPHeight {observation.measuring_point_height} | nma_pk_waterlevels {observation.nma_pk_waterlevels}" + return None + + return field_event_participants + + def _get_dt_utc(self, row) -> datetime | None: + if pd.isna(row.DateMeasured): + logger.critical( + f"transfer_water_levels. Skipping row PointID={row.PointID}, objectid={row.OBJECTID} because there is no DateMeasured" ) - session.commit() + self._capture_error(row.PointID, "no DateMeasured", "DateMeasured") + return None - return input_df, cleaned_df, errors + if pd.isna(row.TimeMeasured): + fmt = "%Y-%m-%d" + dt_measured = row.DateMeasured + else: + fmt = "%Y-%m-%d %H:%M:%S.%f" + t = row.TimeMeasured + # Truncate microseconds to 6 digits if present + if "." in t: + t = t[:-6] + + dt_measured = f"{row.DateMeasured} {t}" + + try: + dt = datetime.strptime(dt_measured, fmt) + return convert_mt_to_utc(dt) + except ValueError as e: + self._capture_error(row.PointID, str(e), "DateMeasured") + logger.critical( + f"transfer_water_levels. 
Skipping row PointID={row.PointID}, objectid={row.OBJECTID} due to " + f"invalid date/time: {e}" + ) + return None # ============= EOF ============================================= From 09c71271f9f5dd4d29716bb76e7f709f0db2392b Mon Sep 17 00:00:00 2001 From: jakeross Date: Sun, 30 Nov 2025 09:55:07 -0700 Subject: [PATCH 24/66] refactor: enhance water levels transfer process by introducing WaterLevelsContinuousPressureTransferer and WaterLevelsContinuousAcousticTransferer classes --- transfers/util.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/transfers/util.py b/transfers/util.py index 70e6952a5..cf290c591 100644 --- a/transfers/util.py +++ b/transfers/util.py @@ -18,6 +18,7 @@ import math import os import re +import time from datetime import datetime, timezone, timedelta, UTC from pathlib import Path @@ -28,7 +29,7 @@ from sqlalchemy import select from constants import SRID_WGS84, SRID_UTM_ZONE_13N -from db import Thing, Location, DataProvenance +from db import Thing, Location, DataProvenance, Parameter from db.engine import session_ctx from services.gcs_helper import get_storage_bucket @@ -205,10 +206,16 @@ def replace_nans(df: pd.DataFrame, default=None) -> pd.DataFrame: return df.replace({np.nan: default}) -def read_csv(name: str, dtype: dict | None = None) -> pd.DataFrame: +def read_csv(name: str, dtype: dict | None = None, *args, **kw) -> pd.DataFrame: p = get_transfers_data_path(Path("nma_csv_cache") / f"{name}.csv") if os.path.exists(p): - return pd.read_csv(p, dtype=dtype) + logger.info(f"Using cached csv: {p}") + starttime = time.time() + df = pd.read_csv(p, dtype=dtype, *args, **kw) + logger.info(f"Read csv in {time.time()-starttime:0.2f}") + return df + else: + logger.info(f"Downloading csv: {name}") bucket = get_storage_bucket() blob = bucket.blob(f"nma_csv/{name}.csv") @@ -216,10 +223,7 @@ def read_csv(name: str, dtype: dict | None = None) -> pd.DataFrame: with open(p, "wb") as f: f.write(data) - if 
dtype: - return pd.read_csv(io.BytesIO(data), dtype=dtype) - else: - return pd.read_csv(io.BytesIO(data)) + return pd.read_csv(io.BytesIO(data), dtype=dtype) def get_valid_point_ids(thing_type="water well"): @@ -339,6 +343,17 @@ def chunk_by_size(df, chunk_size): yield df.iloc[i : i + chunk_size] +def get_groundwater_parameter_id(): + with session_ctx() as session: + groundwater_parameter_id = ( + session.query(Parameter) + .filter(Parameter.parameter_name == "groundwater level") + .one() + .id + ) + return groundwater_parameter_id + + def make_location(row: pd.Series, elevations: dict) -> tuple: """ Returns a tuple of location data and the elevation method From 76b1d3b965672ec31e9784d83cca35811f26624f Mon Sep 17 00:00:00 2001 From: jakeross Date: Sun, 30 Nov 2025 09:57:22 -0700 Subject: [PATCH 25/66] refactor: enhance water levels transfer process by introducing WaterLevelsContinuousPressureTransferer and WaterLevelsContinuousAcousticTransferer classes --- transfers/transfer.py | 102 +++--- transfers/transferer.py | 4 + transfers/waterlevels_transducer_transfer.py | 319 ++++++++++--------- 3 files changed, 233 insertions(+), 192 deletions(-) diff --git a/transfers/transfer.py b/transfers/transfer.py index 5cfc6e63d..04d5c44c1 100644 --- a/transfers/transfer.py +++ b/transfers/transfer.py @@ -19,11 +19,12 @@ load_dotenv() -from transfers.metrics import Metrics from transfers.waterlevels_transducer_transfer import ( - transfer_water_levels_pressure, - transfer_water_levels_acoustic, + WaterLevelsContinuousPressureTransferer, + WaterLevelsContinuousAcousticTransferer, ) + +from transfers.metrics import Metrics from core.initializers import erase_and_rebuild_db from transfers.group_transfer import ProjectGroupTransferer @@ -95,13 +96,13 @@ def transfer_all(sess, metrics, limit=100): results = _execute_transfer(WaterLevelTransferer, flags=flags) metrics.water_level_metrics(*results) - message("TRANSFERRING WATER LEVELS PRESSURE") - results = 
timeit_direct(transfer_water_levels_pressure, sess) - metrics.pressure_metrics(sess, *results) - - message("TRANSFERRING WATER LEVELS ACOUSTIC") - results = timeit_direct(transfer_water_levels_acoustic, sess) - metrics.acoustic_metrics(sess, *results) + # message("TRANSFERRING WATER LEVELS PRESSURE") + # results = timeit_direct(transfer_water_levels_pressure, sess) + # metrics.pressure_metrics(sess, *results) + # + # message("TRANSFERRING WATER LEVELS ACOUSTIC") + # results = timeit_direct(transfer_water_levels_acoustic, sess) + # metrics.acoustic_metrics(sess, *results) """ Developer's notes @@ -147,13 +148,23 @@ def transfer_debugging(metrics, limit=100): results = _execute_transfer(WellTransferer, flags=flags) metrics.well_metrics(*results) - message("TRANSFERRING WELL SCREENS") - results = _execute_transfer(WellScreenTransferer, flags=flags) - metrics.well_screen_metrics(*results) + transfer_screens = False + transfer_sensors = True + transfer_pressure = True + transfer_acoustic = True + transfer_link_ids = False + transfer_groups = False + transfer_assets = False - message("TRANSFERRING SENSORS") - results = _execute_transfer(SensorTransferer, flags=flags) - metrics.sensor_metrics(*results) + if transfer_screens: + message("TRANSFERRING WELL SCREENS") + results = _execute_transfer(WellScreenTransferer, flags=flags) + metrics.well_screen_metrics(*results) + + if transfer_sensors: + message("TRANSFERRING SENSORS") + results = _execute_transfer(SensorTransferer, flags=flags) + metrics.sensor_metrics(*results) # Developer's notes all the metadata for these Things are not defined in the models/schemas yet' # message("TRANSFERRING SPRINGS") @@ -172,31 +183,40 @@ def transfer_debugging(metrics, limit=100): # results = timeit_direct(transfer_contacts, sess) # metrics.contact_metrics(sess, *results) # - message("TRANSFERRING WATER LEVELS") - results = _execute_transfer(WaterLevelTransferer, flags=flags) - metrics.water_level_metrics(*results) - # - # 
message("TRANSFERRING WATER LEVELS PRESSURE") - # results = timeit_direct(transfer_water_levels_pressure, sess) - # metrics.pressure_metrics(sess, *results) - # - # message("TRANSFERRING WATER LEVELS ACOUSTIC") - # results = timeit_direct(transfer_water_levels_acoustic, sess) - # metrics.acoustic_metrics(sess, *results) - - message("TRANSFERRING LINK IDS") - results = _execute_transfer(LinkIdsWellDataTransferer, flags=flags) - metrics.welldata_link_ids_metrics(*results) - results = _execute_transfer(LinkIdsLocationDataTransferer, flags=flags) - metrics.location_link_ids_metrics(*results) - - message("TRANSFERRING GROUPS") - results = _execute_transfer(ProjectGroupTransferer, flags=flags) - metrics.group_metrics(*results) - - message("TRANSFERRING ASSETS") - results = _execute_transfer(AssetTransferer, flags=flags) - metrics.asset_metrics(*results) + # message("TRANSFERRING WATER LEVELS") + # results = _execute_transfer(WaterLevelTransferer, flags=flags) + # metrics.water_level_metrics(*results) + + if transfer_pressure: + message("TRANSFERRING WATER LEVELS PRESSURE") + results = _execute_transfer( + WaterLevelsContinuousPressureTransferer, flags=flags + ) + metrics.pressure_metrics(*results) + + if transfer_acoustic: + message("TRANSFERRING WATER LEVELS ACOUSTIC") + results = _execute_transfer( + WaterLevelsContinuousAcousticTransferer, flags=flags + ) + metrics.acoustic_metrics(*results) + + if transfer_link_ids: + message("TRANSFERRING LINK IDS") + results = _execute_transfer(LinkIdsWellDataTransferer, flags=flags) + metrics.welldata_link_ids_metrics(*results) + results = _execute_transfer(LinkIdsLocationDataTransferer, flags=flags) + metrics.location_link_ids_metrics(*results) + + if transfer_groups: + message("TRANSFERRING GROUPS") + results = _execute_transfer(ProjectGroupTransferer, flags=flags) + metrics.group_metrics(*results) + + if transfer_assets: + message("TRANSFERRING ASSETS") + results = _execute_transfer(AssetTransferer, flags=flags) + 
metrics.asset_metrics(*results) def main(): diff --git a/transfers/transferer.py b/transfers/transferer.py index 8d84e1170..a8045dccb 100644 --- a/transfers/transferer.py +++ b/transfers/transferer.py @@ -167,6 +167,7 @@ def _group_iterator(self, session: Session): continue prepped_group = self._get_prepped_group(group) + self._pre_group_step(session, prepped_group, db_item) for row in prepped_group.itertuples(): try: self._group_step(session, row, db_item) @@ -181,6 +182,9 @@ def _group_iterator(self, session: Session): def _get_point_id(self, row: pd.Series, db_item: Base) -> str: return row.PointID + def _pre_group_step(self, session: Session, group: DataFrame, db_item: Base): + pass + def _group_step(self, session: Session, row: pd.Series, db_item: Base): raise NotImplementedError("Must be implemented in subclass") diff --git a/transfers/waterlevels_transducer_transfer.py b/transfers/waterlevels_transducer_transfer.py index e4ce178c0..927d8d6b8 100644 --- a/transfers/waterlevels_transducer_transfer.py +++ b/transfers/waterlevels_transducer_transfer.py @@ -13,179 +13,196 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# =============================================================================== -from pandas import to_datetime, Timestamp + +import pandas as pd +from pandas import Timestamp from pydantic import ValidationError +from sqlalchemy.orm import Session -from db import Parameter, Thing, Deployment, Sensor +from db import Thing, Deployment, Sensor from db.transducer import TransducerObservation, TransducerObservationBlock +from schemas.transducer import CreateTransducerObservation from transfers.logger import logger -from transfers.util import read_csv, filter_to_valid_point_ids +from transfers.transferer import Transferer +from transfers.util import ( + read_csv, + filter_to_valid_point_ids, + get_groundwater_parameter_id, +) + + +class WaterLevelsContinuousTransferer(Transferer): + _partition_field: str + _sensor_type: str + + def __init__(self, *args, **kw): + super().__init__(*args, **kw) + self.groundwater_parameter_id = get_groundwater_parameter_id() + if self._sensor_type is None: + raise ValueError("_sensor_type must be set") + if self._partition_field is None: + raise ValueError("_partition_field must be set") + + def _get_dfs(self): + input_df = read_csv(self.source_table, parse_dates=["DateMeasured"]) + cleaned_df = filter_to_valid_point_ids(input_df) + cleaned_df = cleaned_df.sort_values(by=["PointID"]) + # remove rows with no date measured + cleaned_df = cleaned_df[cleaned_df.DateMeasured.notna()] + return input_df, cleaned_df + + def _transfer_hook(self, session: Session) -> None: + gwd = self.cleaned_df.groupby(["PointID"]) + n = len(gwd) + nodeployments = {} + for i, (index, group) in enumerate(gwd): + pointid = index[0] + logger.info( + f"Processing PointID: {pointid}. {i + 1}/{n} ({100*(i+1)/n:0.2f}) completed." 
+ ) -def transfer_water_levels_acoustic(session): - source_table = "WaterLevelsContinuous_Acoustic" - wd = read_csv(source_table) - return _transfer_water_levels_continuous( - session, source_table, wd, "PublicRelease", "Acoustic Sounder" - ) + deployments = ( + session.query(Deployment) + .join(Thing) + .join(Sensor) + .where(Sensor.sensor_type == self._sensor_type) + .where(Thing.name == pointid) + .all() + ) + # sort rows by date measured + group = group.sort_values(by="DateMeasured") + field = getattr(group, self._partition_field) -def transfer_water_levels_pressure(session): - source_table = "WaterLevelsContinuous_Pressure" - wd = read_csv(source_table) - return _transfer_water_levels_continuous( - session, source_table, wd, "QCed", "Pressure Transducer" - ) + qced = group[field == 1] + notqced = group[~(field == 1)] + qced_block = TransducerObservationBlock( + parameter_id=self.groundwater_parameter_id, review_status="approved" + ) + notqced_block = TransducerObservationBlock( + parameter_id=self.groundwater_parameter_id, review_status="not reviewed" + ) -def _find_deployment(ts, deployments): - for d in deployments: - start = Timestamp(d.installation_date) - if start > ts: - break # because sorted by start - end = Timestamp(d.removal_date) if d.removal_date else Timestamp.max - if end >= ts: - return d - return None + for block, rows, release_status in ( + (qced_block, qced, "public"), + (notqced_block, notqced, "private"), + ): + block.start_datetime = rows.DateMeasured.min() + block.end_datetime = rows.DateMeasured.max() + if not deployments: + logger.critical( + f"Thing with PointID={pointid} has no deployments. 
Skipping water levels {release_status} block" + ) + self._capture_error(pointid, "no deployments", "DateMeasured") + continue -def _transfer_water_levels_continuous( - session, source_table, input_df, partition_field, sensor_type -): - from schemas.transducer import CreateTransducerObservation - - groundwater_parameter_id = ( - session.query(Parameter) - .filter(Parameter.parameter_name == "groundwater level") - .one() - .id - ) - cleaned_df = filter_to_valid_point_ids(session, input_df) - - # group by pointid - cleaned_df = cleaned_df.sort_values(by=["PointID"]) - gwd = cleaned_df.groupby(["PointID"]) - n = len(gwd) - errors = [] - nodeployments = {} - for i, (index, group) in enumerate(gwd): - pointid = index[0] - logger.info( - f"Processing PointID: {pointid}. {i + 1}/{n} ({100*(i+1)/n:0.2f}) completed." - ) - - deployments = ( - session.query(Deployment) - .join(Thing) - .join(Sensor) - .where(Sensor.sensor_type == sensor_type) - .where(Thing.name == pointid) - .all() - ) + if rows.empty: + logger.info(f"no {release_status} records for pointid {pointid}") + continue - # remove rows with no date measured - group = group[group.DateMeasured.notna()] - group["DateMeasured"] = to_datetime(group["DateMeasured"], errors="coerce") - - # sort rows by date measured - group = group.sort_values(by="DateMeasured") - field = getattr(group, partition_field) - - qced = group[field == 1] - notqced = group[~(field == 1)] - - qced_block = TransducerObservationBlock( - parameter_id=groundwater_parameter_id, review_status="approved" - ) - notqced_block = TransducerObservationBlock( - parameter_id=groundwater_parameter_id, review_status="not reviewed" - ) - - for block, rows, release_status in ( - (qced_block, qced, "public"), - (notqced_block, notqced, "private"), - ): - block.start_datetime = rows.DateMeasured.min() - block.end_datetime = rows.DateMeasured.max() - - if not deployments: - logger.critical( - f"Thing with PointID={pointid} has no deployments. 
Skipping water levels {release_status} block" + deps_sorted = sorted( + deployments, key=lambda d: Timestamp(d.installation_date) ) - errors.append({"pointid": pointid, "error": "no deployments"}) - continue - if rows.empty: - logger.info(f"no {release_status} records for pointid {pointid}") - continue + observations = [ + self._make_observation( + pointid, row, release_status, deps_sorted, nodeployments + ) + for row in rows.itertuples() + ] + + observations = [obs for obs in observations if obs is not None] + session.bulk_save_objects(observations) + session.add(block) + logger.info( + f"Added {len(observations)} water levels {release_status} block" + ) + try: + session.commit() + except Exception as e: + self.append({"pointid": pointid, "error": e}) + logger.critical( + f"Error committing water levels {release_status} block: {e}" + ) + session.rollback() + continue - observations = [] + # convert nodeployments to errors + for pointid, (min_date, max_date) in nodeployments.items(): + self._capture_error( + pointid, + "DateMeasured", + f"no deployment between {min_date} and {max_date}", + ) - deps_sorted = sorted( - deployments, key=lambda d: Timestamp(d.installation_date) + def _make_observation( + self, + pointid: str, + row: pd.Series, + release_status: str, + deps_sorted: list, + nodeployments: dict, + ) -> TransducerObservation | None: + deployment = _find_deployment(row.DateMeasured, deps_sorted) + + if deployment is None: + if pointid not in nodeployments: + nodeployments[pointid] = (row.DateMeasured, row.DateMeasured) + else: + min_date, max_date = nodeployments[pointid] + if row.DateMeasured < min_date: + min_date = row.DateMeasured + elif row.DateMeasured > max_date: + max_date = row.DateMeasured + nodeployments[pointid] = min_date, max_date + + logger.critical( + f"No deployment found for PointID={pointid} at {row.DateMeasured}" + ) + return None + + try: + payload = dict( + parameter_id=self.groundwater_parameter_id, + deployment_id=deployment.id, + 
observation_datetime=row.DateMeasured, + value=row.DepthToWaterBGS, + release_status=release_status, ) + obspayload = CreateTransducerObservation.model_validate( + payload + ).model_dump() + return TransducerObservation(**obspayload) - for row in rows.itertuples(): - deployment = _find_deployment(row.DateMeasured, deps_sorted) + except ValidationError as e: + logger.critical(f"Observation validation error: {e.errors()}") + self._capture_error(pointid, str(e), "DepthToWaterBGS") - if deployment is None: - if pointid not in nodeployments: - nodeployments[pointid] = (row.DateMeasured, row.DateMeasured) - else: - min_date, max_date = nodeployments[pointid] - if row.DateMeasured < min_date: - min_date = row.DateMeasured - elif row.DateMeasured > max_date: - max_date = row.DateMeasured - nodeployments[pointid] = min_date, max_date - logger.critical( - f"No deployment found for PointID={pointid} at {row.DateMeasured}" - ) - continue +class WaterLevelsContinuousPressureTransferer(WaterLevelsContinuousTransferer): + source_table = "WaterLevelsContinuous_Pressure" + _partition_field = "QCed" + _sensor_type = "Pressure Transducer" - try: - payload = dict( - parameter_id=groundwater_parameter_id, - deployment_id=deployment.id, - observation_datetime=row.DateMeasured, - value=row.DepthToWaterBGS, - release_status=release_status, - ) - obspayload = CreateTransducerObservation.model_validate( - payload - ).model_dump() - observations.append(TransducerObservation(**obspayload)) - except ValidationError as e: - logger.critical(f"Observation validation error: {e.errors()}") - errors.append({"pointid": pointid, "error": e.errors()}) - - session.bulk_save_objects(observations) - session.add(block) - logger.info( - f"Added {len(observations)} water levels {release_status} block" - ) - try: - session.commit() - except Exception as e: - errors.append({"pointid": pointid, "error": e}) - logger.critical( - f"Error committing water levels {release_status} block: {e}" - ) - 
session.rollback() - continue - - # convert nodeployments to errors - for pointid, (min_date, max_date) in nodeployments.items(): - errors.append( - { - "table": source_table, - "pointid": pointid, - "error": f"no deployment between {min_date} and {max_date}", - } - ) - - return input_df, cleaned_df, errors + +class WaterLevelsContinuousAcousticTransferer(WaterLevelsContinuousTransferer): + source_table = "WaterLevelsContinuous_Acoustic" + _partition_field = "PublicRelease" + _sensor_type = "Acoustic Sounder" + + +def _find_deployment(ts, deployments): + for d in deployments: + start = Timestamp(d.installation_date) + if start > ts: + break # because sorted by start + end = Timestamp(d.removal_date) if d.removal_date else Timestamp.max + if end >= ts: + return d + return None # ============= EOF ============================================= From b6e5039ae69942b69fd612b1dc34385b54035127 Mon Sep 17 00:00:00 2001 From: jakeross Date: Sun, 30 Nov 2025 10:30:07 -0700 Subject: [PATCH 26/66] refactor: simplify transfer_all function by removing unnecessary parameters and restructuring transfer logic --- transfers/transfer.py | 120 ++++++++---------------------------------- 1 file changed, 22 insertions(+), 98 deletions(-) diff --git a/transfers/transfer.py b/transfers/transfer.py index 04d5c44c1..5d167d7f5 100644 --- a/transfers/transfer.py +++ b/transfers/transfer.py @@ -17,6 +17,8 @@ from dotenv import load_dotenv +from db.engine import session_ctx + load_dotenv() from transfers.waterlevels_transducer_transfer import ( @@ -50,94 +52,8 @@ def message(msg, pad=10, new_line_at_top=True): @timeit -def transfer_all(sess, metrics, limit=100): +def transfer_all(metrics, limit=100): message("STARTING TRANSFER", new_line_at_top=False) - - logger.info("Erase and rebuilding database") - erase_and_rebuild_db() - - message("TRANSFERRING WELLS") - - flags = { - "TRANSFER_ALL_WELLS": True, - "TRANSFER_ALL_WELLSCREENS": True, - "LIMIT": limit, - } - - results = 
_execute_transfer(WellTransferer, flags=flags) - metrics.well_metrics(sess, *results) - - message("TRANSFERRING WELL SCREENS") - results = _execute_transfer(WellScreenTransferer, flags=flags) - metrics.well_screen_metrics(sess, *results) - - message("TRANSFERRING SENSORS") - results = _execute_transfer(SensorTransferer, flags=flags) - metrics.sensor_metrics(sess, *results) - - # Developer's notes all the metadata for these Things are not defined in the models/schemas yet' - # message("TRANSFERRING SPRINGS") - # timeit_direct(transfer_springs, sess, limit=limit) - # - # message("TRANSFERRING PERENNIAL STREAMS") - # timeit_direct(transfer_perennial_stream, sess, limit=limit) - # - # message("TRANSFERRING EPHEMERAL STREAMS") - # timeit_direct(transfer_ephemeral_stream, sess, limit=limit) - # - # message("TRANSFERRING METEOROLOGICAL") - # timeit_direct(transfer_met, sess, limit) - - message("TRANSFERRING CONTACTS") - results = timeit_direct(transfer_contacts, sess) - metrics.contact_metrics(sess, *results) - - message("TRANSFERRING WATER LEVELS") - results = _execute_transfer(WaterLevelTransferer, flags=flags) - metrics.water_level_metrics(*results) - - # message("TRANSFERRING WATER LEVELS PRESSURE") - # results = timeit_direct(transfer_water_levels_pressure, sess) - # metrics.pressure_metrics(sess, *results) - # - # message("TRANSFERRING WATER LEVELS ACOUSTIC") - # results = timeit_direct(transfer_water_levels_acoustic, sess) - # metrics.acoustic_metrics(sess, *results) - - """ - Developer's notes - - When transfering water chemistry data use the qc_type field to indicate - normal/blanks/duplicates instead of what comes from LU_SampleType. Use - those values, however, to map to the standard qc_type fields if applicable - (i.e. 
not applicable when sample type is "Soil or rock sample" or - "Precipitation," but is applicable when sample type is "Equipment blank" - or "Field duplicate") - """ - message("TRANSFERRING LINK IDS") - results = _execute_transfer(LinkIdsWellDataTransferer, flags=flags) - metrics.welldata_link_ids_metrics(*results) - results = _execute_transfer(LinkIdsLocationDataTransferer, flags=flags) - metrics.location_link_ids_metrics(*results) - - message("TRANSFERRING GROUPS") - results = _execute_transfer(ProjectGroupTransferer, flags=flags) - metrics.group_metrics(*results) - - message("TRANSFERRING ASSETS") - results = _execute_transfer(AssetTransferer, flags=flags) - metrics.asset_metrics(*results) - - -def _execute_transfer(klass, flags: dict = None): - transferer = klass(flags=flags) - transferer.transfer() - return transferer.input_df, transferer.cleaned_df, transferer.errors - - -def transfer_debugging(metrics, limit=100): - message("STARTING TRANSFER DEBUG", new_line_at_top=False) - if int(os.environ.get("ERASE_AND_REBUILD", 0)): logger.info("Erase and rebuilding database") erase_and_rebuild_db() @@ -150,11 +66,13 @@ def transfer_debugging(metrics, limit=100): transfer_screens = False transfer_sensors = True + transfer_waterlevels = False transfer_pressure = True transfer_acoustic = True transfer_link_ids = False transfer_groups = False transfer_assets = False + do_transfer_contacts = False if transfer_screens: message("TRANSFERRING WELL SCREENS") @@ -179,13 +97,16 @@ def transfer_debugging(metrics, limit=100): # message("TRANSFERRING METEOROLOGICAL") # timeit_direct(transfer_met, sess, limit) - # message("TRANSFERRING CONTACTS") - # results = timeit_direct(transfer_contacts, sess) - # metrics.contact_metrics(sess, *results) - # - # message("TRANSFERRING WATER LEVELS") - # results = _execute_transfer(WaterLevelTransferer, flags=flags) - # metrics.water_level_metrics(*results) + if do_transfer_contacts: + message("TRANSFERRING CONTACTS") + with session_ctx() as sess: 
+ results = timeit_direct(transfer_contacts, sess) + metrics.contact_metrics(sess, *results) + + if transfer_waterlevels: + message("TRANSFERRING WATER LEVELS") + results = _execute_transfer(WaterLevelTransferer, flags=flags) + metrics.water_level_metrics(*results) if transfer_pressure: message("TRANSFERRING WATER LEVELS PRESSURE") @@ -219,15 +140,18 @@ def transfer_debugging(metrics, limit=100): metrics.asset_metrics(*results) +def _execute_transfer(klass, flags: dict = None): + transferer = klass(flags=flags) + transferer.transfer() + return transferer.input_df, transferer.cleaned_df, transferer.errors + + def main(): message("START--------------------------------------") limit = int(os.getenv("TRANSFER_LIMIT", 1000)) metrics = Metrics() - if int(os.getenv("TRANSFER_DEBUG", 0)): - transfer_debugging(metrics, limit=limit) - else: - transfer_all(metrics, limit=limit) + transfer_all(metrics, limit=limit) metrics.close() metrics.save_to_storage_bucket() From c77411d504760d87bc5e4901ea6d24c539249b85 Mon Sep 17 00:00:00 2001 From: jakeross Date: Sun, 30 Nov 2025 16:42:19 -0700 Subject: [PATCH 27/66] refactor: implement ContactTransfer class for improved contact data handling and transfer process --- services/query_helper.py | 13 +-- services/util.py | 22 ++++- transfers/contact_transfer.py | 148 +++++++++++----------------------- transfers/transfer.py | 31 ++++--- 4 files changed, 82 insertions(+), 132 deletions(-) diff --git a/services/query_helper.py b/services/query_helper.py index 3f0e3dd24..970ad1720 100644 --- a/services/query_helper.py +++ b/services/query_helper.py @@ -25,18 +25,7 @@ from db import search as search_func from services.regex import QUERY_REGEX - - -def to_bool(value: str) -> bool | str: - """Convert a string to a boolean.""" - if isinstance(value, bool): - return value - if value.lower() in ("true", "1", "yes"): - return True - elif value.lower() in ("false", "0", "no"): - return False - - return value +from services.util import to_bool def 
make_where(col: Column, op: str, v: str) -> OperatorExpression: diff --git a/services/util.py b/services/util.py index 77cd5d5cd..313a922ec 100644 --- a/services/util.py +++ b/services/util.py @@ -1,17 +1,33 @@ import json +import os -from shapely.ops import transform -import pyproj import httpx +import pyproj +from shapely.ops import transform from sqlalchemy.orm import DeclarativeBase from constants import SRID_WGS84 - TRANSFORMERS = {} METERS_TO_FEET = 3.28084 +def to_bool(value: str) -> bool | str: + """Convert a string to a boolean.""" + if isinstance(value, bool): + return value + if value.lower() in ("true", "1", "yes"): + return True + elif value.lower() in ("false", "0", "no"): + return False + + return value + + +def get_bool_env(key, default=False): + return to_bool(os.getenv(key, default)) + + def transform_srid(geometry, source_srid, target_srid): """ geometry must be a shapely geometry object, like Point, Polygon, or MultiPolygon diff --git a/transfers/contact_transfer.py b/transfers/contact_transfer.py index c9b1c9fb0..a1d545a03 100644 --- a/transfers/contact_transfer.py +++ b/transfers/contact_transfer.py @@ -15,140 +15,79 @@ # =============================================================================== import json +import pandas as pd +from pandas import DataFrame from pydantic import ValidationError +from sqlalchemy.orm import Session from db import ( - Thing, Contact, ThingContactAssociation, Email, Phone, Address, IncompleteNMAPhone, + Base, ) from transfers.logger import logger +from transfers.transferer import ThingBasedTransferer from transfers.util import ( get_transfers_data_path, - chunk_by_size, ) from transfers.util import read_csv, filter_to_valid_point_ids, replace_nans -def extract_owner_role(comment): - # if comment is None: - # return "Owner" - # if "Owner" in comment: - # return "Owner" - # if "Manager" in comment: - # return "Manager" - # if "Director" in comment: - # return "Director" +class 
ContactTransfer(ThingBasedTransferer): + source_table = "OwnersData" - return "Owner" + def __init__(self, *args, **kw): + super().__init__(*args, **kw) + co_to_org_mapper_path = get_transfers_data_path( + "owners_organization_mapper.json" + ) + with open(co_to_org_mapper_path, "r") as f: + self._co_to_org_mapper = json.load(f) + self._added = [] -""" -Developer's notes + def _get_dfs(self): + input_df = read_csv(self.source_table) + odf = input_df.drop(["OBJECTID", "GlobalID"], axis=1) + ldf = read_csv("OwnerLink") + ldf = ldf.drop(["OBJECTID", "GlobalID"], axis=1) + locdf = read_csv("Location") + ldf = ldf.join(locdf.set_index("LocationId"), on="LocationId") -Use Pydantic to perform model validations since all restrictions will -be built into the models -""" + odf = odf.join(ldf.set_index("OwnerKey"), on="OwnerKey") + odf = replace_nans(odf) -def transfer_contacts(session): + odf = filter_to_valid_point_ids(odf) + return input_df, odf - co_to_org_mapper_path = get_transfers_data_path("owners_organization_mapper.json") - with open(co_to_org_mapper_path, "r") as f: - co_to_org_mapper = json.load(f) - - source_table = "OwnersData" - input_df = read_csv(source_table) - odf = input_df.drop(["OBJECTID", "GlobalID"], axis=1) - ldf = read_csv("OwnerLink") - ldf = ldf.drop(["OBJECTID", "GlobalID"], axis=1) - locdf = read_csv("Location") - ldf = ldf.join(locdf.set_index("LocationId"), on="LocationId") - - odf = odf.join(ldf.set_index("OwnerKey"), on="OwnerKey") - - odf = replace_nans(odf) - - odf = filter_to_valid_point_ids(session, odf) - cleaned_df = odf - errors = [] - added = [] - odf = odf.sort_values(by=["PointID"]) - - for chunk in chunk_by_size(odf, 100): - pointids = chunk.PointID.tolist() - logger.info(f"Processing chunk {pointids[0]} to {pointids[-1]}") - things = session.query(Thing).filter(Thing.name.in_(pointids)).all() - for i, row in chunk.iterrows(): - thing = next((thing for thing in things if thing.name == row.PointID), None) - logger.info(f"Processing 
PointID: {i} {row.PointID}") - if thing is None: - logger.critical( - f"Thing with PointID {row.PointID} not found. Skipping owner." - ) - continue - - # TODO: use contact_helper.add_contact - try: - if _add_first_contact(session, row, thing, co_to_org_mapper, added): - session.commit() - # session.flush() - logger.info(f"added first contact for PointID {row.PointID}") - except ValidationError as e: - logger.critical( - f"Skipping first contact for PointID {row.PointID} due to validation error: {e.errors()}" - ) - # session.rollback() - errors.append( - {"pointid": row.PointID, "error": e, "table": source_table} - ) - except Exception as e: - logger.critical( - f"Skipping first contact for PointID {row.PointID} due to error: {e}" - ) - session.rollback() - errors.append( - {"pointid": row.PointID, "error": e, "table": source_table} - ) + def _get_prepped_group(self, group) -> DataFrame: + return group.sort_values(by=["PointID"]) + def _group_step(self, session: Session, row: pd.Series, db_item: Base): + for adder, tag in (_add_first_contact, "first"), ( + _add_second_contact, + "second", + ): try: - if ( - row.SecondFirstName is None - and row.SecondLastName is None - and row.SecondCtctEmail is None - and row.SecondCtctPhone is None - ): - logger.warning( - f"No second contact info for PointID {row.PointID}, skipping." 
- ) - continue - if _add_second_contact(session, row, thing, co_to_org_mapper, added): + if adder(session, row, db_item, self._co_to_org_mapper, self._added): session.commit() - # session.flush() - logger.info(f"added second contact for PointID {row.PointID}") - + logger.info(f"added {tag} contact for PointID {row.PointID}") except ValidationError as e: logger.critical( - f"Skipping second contact for PointID {row.PointID} due to validation error: {e.errors()}" - ) - # session.rollback() - errors.append( - {"pointid": row.PointID, "error": e, "table": source_table} + f"Skipping {tag} contact for PointID {row.PointID} due to validation error: {e.errors()}" ) + self._capture_error(row.PointID, str(e), "ValidationError") except Exception as e: logger.critical( - f"Skipping second contact for PointID {row.PointID} due to error: {e}" + f"Skipping {tag} contact for PointID {row.PointID} due to error: {e}" ) session.rollback() - errors.append( - {"pointid": row.PointID, "error": e, "table": source_table} - ) - - return input_df, cleaned_df, errors + self._capture_error(row.PointID, str(e), "UnknownError") def _add_first_contact(session, row, thing, co_to_org_mapper, added): @@ -252,6 +191,14 @@ def _add_first_contact(session, row, thing, co_to_org_mapper, added): def _add_second_contact(session, row, thing, co_to_org_mapper, added): + if all( + [ + getattr(row, f"Second{f}") is None + for f in ["FirstName", "LastName", "CtctEmail", "CtctPhone"] + ] + ): + logger.warning(f"No second contact info for PointID {row.PointID}, skipping.") + return release_status = "private" name = _make_name(row.SecondFirstName, row.SecondLastName) @@ -364,7 +311,6 @@ def _make_address(first_second, ownerkey, kind, **kw): ) -# def _make_contact_and_assoc(session, data, thing): from schemas.contact import CreateContact diff --git a/transfers/transfer.py b/transfers/transfer.py index 5d167d7f5..8a9c3bed3 100644 --- a/transfers/transfer.py +++ b/transfers/transfer.py @@ -17,7 +17,7 @@ from dotenv 
import load_dotenv -from db.engine import session_ctx +from services.util import get_bool_env load_dotenv() @@ -34,13 +34,13 @@ LinkIdsWellDataTransferer, LinkIdsLocationDataTransferer, ) -from transfers.contact_transfer import transfer_contacts +from transfers.contact_transfer import ContactTransfer from transfers.sensor_transfer import SensorTransferer from transfers.waterlevels_transfer import WaterLevelTransferer from transfers.well_transfer import WellTransferer, WellScreenTransferer from transfers.asset_transfer import AssetTransferer -from transfers.util import timeit, timeit_direct +from transfers.util import timeit from transfers.logger import logger, save_log_to_bucket @@ -64,15 +64,15 @@ def transfer_all(metrics, limit=100): results = _execute_transfer(WellTransferer, flags=flags) metrics.well_metrics(*results) - transfer_screens = False - transfer_sensors = True - transfer_waterlevels = False - transfer_pressure = True - transfer_acoustic = True - transfer_link_ids = False - transfer_groups = False - transfer_assets = False - do_transfer_contacts = False + transfer_screens = get_bool_env("TRANSFER_WELL_SCREENS", True) + transfer_sensors = get_bool_env("TRANSFER_SENSORS", True) + transfer_contacts = get_bool_env("TRANSFER_CONTACTS", True) + transfer_waterlevels = get_bool_env("TRANSFER_WATERLEVELS", True) + transfer_pressure = get_bool_env("TRANSFER_WATERLEVELS_PRESSURE", True) + transfer_acoustic = get_bool_env("TRANSFER_WATERLEVELS_ACOUSTIC", True) + transfer_link_ids = get_bool_env("TRANSFER_LINK_IDS", True) + transfer_groups = get_bool_env("TRANSFER_GROUPS", True) + transfer_assets = get_bool_env("TRANSFER_ASSETS", True) if transfer_screens: message("TRANSFERRING WELL SCREENS") @@ -97,11 +97,10 @@ def transfer_all(metrics, limit=100): # message("TRANSFERRING METEOROLOGICAL") # timeit_direct(transfer_met, sess, limit) - if do_transfer_contacts: + if transfer_contacts: message("TRANSFERRING CONTACTS") - with session_ctx() as sess: - results = 
timeit_direct(transfer_contacts, sess) - metrics.contact_metrics(sess, *results) + results = _execute_transfer(ContactTransfer, flags=flags) + metrics.contact_metrics(*results) if transfer_waterlevels: message("TRANSFERRING WATER LEVELS") From ec7965528ba897c22e6cd6a1641bd8eb381bae0a Mon Sep 17 00:00:00 2001 From: jakeross Date: Mon, 1 Dec 2025 22:56:12 -0700 Subject: [PATCH 28/66] refactor: rename filter_by_welldata_datasource_and_project to get_transferable_wells for clarity and update logic to include additional point IDs --- transfers/util.py | 67 +++++++++++++++++++++-------- transfers/well_transfer.py | 86 ++++++++++++++++++++++++-------------- 2 files changed, 104 insertions(+), 49 deletions(-) diff --git a/transfers/util.py b/transfers/util.py index cf290c591..83c968f4b 100644 --- a/transfers/util.py +++ b/transfers/util.py @@ -206,16 +206,22 @@ def replace_nans(df: pd.DataFrame, default=None) -> pd.DataFrame: return df.replace({np.nan: default}) -def read_csv(name: str, dtype: dict | None = None, *args, **kw) -> pd.DataFrame: +def read_csv( + name: str, dtype: dict | None = None, verbose=False, *args, **kw +) -> pd.DataFrame: p = get_transfers_data_path(Path("nma_csv_cache") / f"{name}.csv") if os.path.exists(p): - logger.info(f"Using cached csv: {p}") + if verbose: + logger.info(f"Using cached csv: {p}") starttime = time.time() df = pd.read_csv(p, dtype=dtype, *args, **kw) - logger.info(f"Read csv in {time.time()-starttime:0.2f}") + + if verbose: + logger.info(f"Read csv in {time.time()-starttime:0.2f}") return df else: - logger.info(f"Downloading csv: {name}") + if verbose: + logger.info(f"Downloading csv: {name}") bucket = get_storage_bucket() blob = bucket.blob(f"nma_csv/{name}.csv") @@ -274,30 +280,55 @@ def filter_non_transferred_wells(df: pd.DataFrame) -> pd.DataFrame: return df[~(df["PointID"].isin(existing_ids))] -def filter_by_welldata_datasource_and_project(df: pd.DataFrame) -> pd.DataFrame: +def get_transferable_wells( + df: pd.DataFrame, 
log_datasource_counts=False, log_invalid_datasources=False +) -> pd.DataFrame: path = get_transfers_data_path("valid_welldata_datasources.csv") with open(path, "r") as f: reader = csv.reader(f) _ = next(reader) valid_datasources = [row[0] for row in reader if row[1] == "Yes"] - # f.seek(0) - # invalid_datasources = [row[0] for row in reader if row[1] == "NO"] - # logger.info("Invalid WellData Datasources:") - # for vd in invalid_datasources: - # logger.info(f" {vd}") + if log_invalid_datasources: + f.seek(0) + invalid_datasources = [row[0] for row in reader if row[1] == "NO"] + logger.info("Invalid WellData Datasources:") + for vd in invalid_datasources: + logger.info(f" {vd}") - counts = df.groupby("DataSource").size().reset_index(name="WellCount") - counts = counts.sort_values("WellCount", ascending=False) - for count in counts.itertuples(): - logger.info(f"{count.WellCount}: {count.DataSource[:50]} ") + if log_datasource_counts: + counts = df.groupby("DataSource").size().reset_index(name="WellCount") + counts = counts.sort_values("WellCount", ascending=False) + for count in counts.itertuples(): + logger.info(f"{count.WellCount}: {count.DataSource[:50]} ") pldf = read_csv("ProjectLocations") collabnet = pldf[pldf["ProjectName"] == "Water Level Network"] - return df[ - df["DataSource"].isin(valid_datasources) - | df["PointID"].isin(collabnet["PointID"]) - ] + + collabnet_pointids = collabnet["PointID"].unique().tolist() + logger.info( + f"collabnet pointids: {len(collabnet_pointids)} {collabnet_pointids[:10]}" + ) + + # get all pointids that have USGS as the DataSource but also have WaterLevel measurements where datasource is + # NMBGMR + usgs_df = df[df["DataSource"] == "USGS"] + + waterlevel_df = read_csv("WaterLevels") + waterlevel_df = waterlevel_df[waterlevel_df["MeasuringAgency"] == "NMBGMR"] + + usgs_pointids = ( + usgs_df[usgs_df["PointID"].isin(waterlevel_df["PointID"])]["PointID"] + .unique() + .tolist() + ) + logger.info(f"usgs pointids: 
{len(usgs_pointids)} {usgs_pointids[:10]}") + + # get all the pointids from the well photos and include them + wellphotos_df = read_csv("WellPhotos") + wellphotos_pointids = wellphotos_df["PointID"].unique().tolist() + pointids = list(set(usgs_pointids + collabnet_pointids + wellphotos_pointids)) + return df[df["DataSource"].isin(valid_datasources) | df["PointID"].isin(pointids)] def filter_by_valid_measuring_agency(df: pd.DataFrame) -> pd.DataFrame: diff --git a/transfers/well_transfer.py b/transfers/well_transfer.py index 45a867a72..fa912ed18 100644 --- a/transfers/well_transfer.py +++ b/transfers/well_transfer.py @@ -51,7 +51,7 @@ read_csv, logger, replace_nans, - filter_by_welldata_datasource_and_project, + get_transferable_wells, lexicon_mapper, filter_non_transferred_wells, MeasuringPointEstimator, @@ -117,35 +117,35 @@ def _extract_casing_materials(row) -> list[str]: return materials -def get_wells_to_transfer(flags: dict = None) -> tuple[pd.DataFrame, pd.DataFrame]: - # if flags is None: - # flags = {} - - wdf = read_csv("WellData", dtype={"OSEWelltagID": str}) - ldf = read_csv("Location") - ldf = ldf.drop(["PointID", "SSMA_TimeStamp"], axis=1) - wdf = wdf.join(ldf.set_index("LocationId"), on="LocationId") - wdf = wdf[wdf["SiteType"] == "GW"] - wdf = wdf[wdf["Easting"].notna() & wdf["Northing"].notna()] - - input_df = wdf - wdf = replace_nans(wdf) - - # if flags.get("TRANSFER_ALL_WELLS", False): - # # todo: filter Locations by DataSource - # cleaned_df = filter_by_welldata_datasource_and_project(wdf) - # else: - # # get a subset of wells that have not been transferred yet - # # todo: this needs to be defined. 
- # # for now, we are just filtering out wells that have not been transferred yet - # # In the future we will be using criteria to determine which wells to transfer - # # for example, wells in the "Water Level Network" project - # cleaned_df = wdf - - cleaned_df = filter_by_welldata_datasource_and_project(wdf) - cleaned_df = filter_non_transferred_wells(cleaned_df) - - return input_df, cleaned_df +# def get_wells_to_transfer(flags: dict = None) -> tuple[pd.DataFrame, pd.DataFrame]: +# # if flags is None: +# # flags = {} +# +# wdf = read_csv("WellData", dtype={"OSEWelltagID": str}) +# ldf = read_csv("Location") +# ldf = ldf.drop(["PointID", "SSMA_TimeStamp"], axis=1) +# wdf = wdf.join(ldf.set_index("LocationId"), on="LocationId") +# wdf = wdf[wdf["SiteType"] == "GW"] +# wdf = wdf[wdf["Easting"].notna() & wdf["Northing"].notna()] +# +# input_df = wdf +# wdf = replace_nans(wdf) +# +# # if flags.get("TRANSFER_ALL_WELLS", False): +# # # todo: filter Locations by DataSource +# # cleaned_df = filter_by_welldata_datasource_and_project(wdf) +# # else: +# # # get a subset of wells that have not been transferred yet +# # # todo: this needs to be defined. 
+# # # for now, we are just filtering out wells that have not been transferred yet +# # # In the future we will be using criteria to determine which wells to transfer +# # # for example, wells in the "Water Level Network" project +# # cleaned_df = wdf +# +# cleaned_df = get_transferable_wells(wdf) +# cleaned_df = filter_non_transferred_wells(cleaned_df) +# +# return input_df, cleaned_df def get_cached_elevations() -> dict: @@ -175,7 +175,31 @@ def __init__(self, *args, **kw): self._added_locations = {} def _get_dfs(self): - return get_wells_to_transfer(self.flags) + wdf = read_csv("WellData", dtype={"OSEWelltagID": str}) + ldf = read_csv("Location") + ldf = ldf.drop(["PointID", "SSMA_TimeStamp"], axis=1) + wdf = wdf.join(ldf.set_index("LocationId"), on="LocationId") + wdf = wdf[wdf["SiteType"] == "GW"] + wdf = wdf[wdf["Easting"].notna() & wdf["Northing"].notna()] + + input_df = wdf + wdf = replace_nans(wdf) + + # if flags.get("TRANSFER_ALL_WELLS", False): + # # todo: filter Locations by DataSource + # cleaned_df = filter_by_welldata_datasource_and_project(wdf) + # else: + # # get a subset of wells that have not been transferred yet + # # todo: this needs to be defined. 
+ # # for now, we are just filtering out wells that have not been transferred yet + # # In the future we will be using criteria to determine which wells to transfer + # # for example, wells in the "Water Level Network" project + # cleaned_df = wdf + + cleaned_df = get_transferable_wells(wdf) + cleaned_df = filter_non_transferred_wells(cleaned_df) + + return input_df, cleaned_df def _step(self, session: Session, df: pd.DataFrame, i: int, row: pd.Series): pointid = row.PointID From 6c08a2595d669a212ebf2da938f611bca0c4c37b Mon Sep 17 00:00:00 2001 From: jakeross Date: Tue, 2 Dec 2025 09:33:47 -0700 Subject: [PATCH 29/66] refactor: add type hints to functions in util.py for improved code clarity and maintainability --- transfers/util.py | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/transfers/util.py b/transfers/util.py index 83c968f4b..d459ee4ff 100644 --- a/transfers/util.py +++ b/transfers/util.py @@ -27,13 +27,12 @@ import pytz from shapely import Point from sqlalchemy import select +from sqlalchemy.orm import Session from constants import SRID_WGS84, SRID_UTM_ZONE_13N from db import Thing, Location, DataProvenance, Parameter from db.engine import session_ctx from services.gcs_helper import get_storage_bucket - -# from services.lexicon_mapper import lexicon_mapper from services.util import ( transform_srid, get_epqs_elevation_from_point, @@ -232,14 +231,14 @@ def read_csv( return pd.read_csv(io.BytesIO(data), dtype=dtype) -def get_valid_point_ids(thing_type="water well"): +def get_valid_point_ids(thing_type: str = "water well") -> list[str]: with session_ctx() as session: things = get_valid_things(session, thing_type) valid_pointids = [thing.name for thing in things] return valid_pointids -def get_valid_things(session, thing_type="water well"): +def get_valid_things(session: Session, thing_type: str = "water well") -> list[Thing]: return session.query(Thing).where(Thing.thing_type == 
thing_type).all() @@ -260,7 +259,7 @@ def extract_organization(alternate_id: str) -> str: return "Unknown" -def get_transfers_data_path(name): +def get_transfers_data_path(name: str) -> Path: def data_path(r): return Path(r) / "transfers" / "data" @@ -349,7 +348,7 @@ def filter_to_valid_point_ids(df: pd.DataFrame) -> pd.DataFrame: return df[df["PointID"].isin(valid_point_ids)] -def convert_mt_to_utc(dt_record: datetime): +def convert_mt_to_utc(dt_record: datetime) -> datetime: t = dt_record.time() if t.hour == 0 and t.minute == 0: # no time was measured, so just set the timezone to UTC and keep @@ -369,12 +368,12 @@ def convert_mt_to_utc(dt_record: datetime): return dt_record -def chunk_by_size(df, chunk_size): +def chunk_by_size(df: pd.DataFrame, chunk_size: int) -> pd.DataFrame: for i in range(0, len(df), chunk_size): yield df.iloc[i : i + chunk_size] -def get_groundwater_parameter_id(): +def get_groundwater_parameter_id() -> int: with session_ctx() as session: groundwater_parameter_id = ( session.query(Parameter) @@ -592,13 +591,26 @@ def wrapper(*args, **kwargs): class LexiconMapper: def __init__(self): - self._mappers = None + self._mappers: dict[str, str] = None - def map_value(self, value): + def map_value(self, value) -> str: value = value.strip() return self._make_lu_to_lexicon_mapper().get(value, value) - def _make_lu_to_lexicon_mapper(self): + def _make_lu_to_lexicon_mapper(self) -> dict[str, str]: + """ + Lookup tables intentionally skipped (kept for documentation only) + Each entry explains why the table is excluded + + "LU_AltitudeDatum": "code is the value, so no need for mapping", + "LU_CoordinateDatum": "code is the value, so no need for mapping", + "LU_FieldNoteTypes": "not being used in the transfers since there are no records", + "LU_Formations": "needs to be cleaned before it can be used", + "LU_Lithology": "needs to be cleaned before it can be used", + "LU_MeasuringAgency": "the abbreviation is what is used in the new schema", + + :return: 
dict + """ if self._mappers: return self._mappers @@ -624,16 +636,6 @@ def _make_lu_to_lexicon_mapper(self): "LU_Status", ] - # Lookup tables intentionally skipped (kept for documentation only) - # Each entry explains why the table is excluded - _lu_tables_skipped = { - "LU_AltitudeDatum": "code is the value, so no need for mapping", - "LU_CoordinateDatum": "code is the value, so no need for mapping", - "LU_FieldNoteTypes": "not being used in the transfers since there are no records", - "LU_Formations": "needs to be cleaned before it can be used", - "LU_Lithology": "needs to be cleaned before it can be used", - "LU_MeasuringAgency": "the abbreviation is what is used in the new schema", - } mappers = {} for lu_table in lu_tables: From 89e8994ce528d36ffaa53eac03815419bfc28db7 Mon Sep 17 00:00:00 2001 From: jross Date: Tue, 2 Dec 2025 17:24:54 -0700 Subject: [PATCH 30/66] feat: implement aquifer and geologic formation models with transfer functionality --- .pre-commit-config.yaml | 2 + core/enums.py | 10 +- core/formations.json | 0 core/lexicon.json | 53 ++- db/__init__.py | 7 +- db/aquifer_system.py | 84 ++++ db/aquifer_type.py | 58 +++ db/contact.py | 8 +- db/data_provenance.py | 12 +- db/geologic_formation.py | 82 ++++ db/notes.py | 4 +- db/permission_history.py | 96 ++++ db/status_history.py | 4 +- db/thing.py | 128 +++++- db/thing_aquifer_association.py | 51 +++ db/thing_geologic_formation_association.py | 60 +++ schemas/aquifer_system.py | 51 +++ schemas/geologic_formation.py | 88 ++++ schemas/permission_history.py | 18 + schemas/thing.py | 94 +++- schemas/validators.py | 43 ++ services/thing_helper.py | 20 + services/util.py | 17 +- tests/features/environment.py | 413 ++++++++++++------ .../steps/well-additional-information.py | 270 ++++++++++++ tests/features/steps/well-core-information.py | 2 +- tests/test_thing.py | 32 ++ transfers/aquifer_system_transfer.py | 141 ++++++ .../data/owners_organization_mapper.json | 1 + transfers/geologic_formation_transfer.py 
| 141 ++++++ transfers/permissions_transfer.py | 95 ++++ transfers/stratigraphy_transfer.py | 285 ++++++++++++ transfers/util.py | 13 +- transfers/well_transfer.py | 8 + 34 files changed, 2219 insertions(+), 172 deletions(-) create mode 100644 core/formations.json create mode 100644 db/aquifer_system.py create mode 100644 db/aquifer_type.py create mode 100644 db/geologic_formation.py create mode 100644 db/permission_history.py create mode 100644 db/thing_aquifer_association.py create mode 100644 db/thing_geologic_formation_association.py create mode 100644 schemas/aquifer_system.py create mode 100644 schemas/geologic_formation.py create mode 100644 schemas/permission_history.py create mode 100644 schemas/validators.py create mode 100644 tests/features/steps/well-additional-information.py create mode 100644 transfers/aquifer_system_transfer.py create mode 100644 transfers/geologic_formation_transfer.py create mode 100644 transfers/permissions_transfer.py create mode 100644 transfers/stratigraphy_transfer.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5d74e6a6c..b4dba7bf8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,6 +25,8 @@ repos: types: [python] # Specify relevant file types for your tests pass_filenames: false always_run: true + args: + - -x # - repo: https://github.com/pre-commit/mirrors-mypy # rev: v1.10.0 # Use the latest stable version or pin to your preference diff --git a/core/enums.py b/core/enums.py index 568f3f96a..91b206cab 100644 --- a/core/enums.py +++ b/core/enums.py @@ -24,7 +24,9 @@ ) CasingMaterial: type[Enum] = build_enum_from_lexicon_category("casing_material") CollectionMethod: type[Enum] = build_enum_from_lexicon_category("collection_method") -ConstructionMethod: type[Enum] = build_enum_from_lexicon_category("construction_method") +WellConstructionMethod: type[Enum] = build_enum_from_lexicon_category( + "well_construction_method" +) ContactType: type[Enum] = 
build_enum_from_lexicon_category("contact_type") CoordinateMethod: type[Enum] = build_enum_from_lexicon_category("coordinate_method") WellPurpose: type[Enum] = build_enum_from_lexicon_category("well_purpose") @@ -68,8 +70,14 @@ Vertical_datum: type[Enum] = build_enum_from_lexicon_category("vertical_datum") ScreenType: type[Enum] = build_enum_from_lexicon_category("screen_type") SensorType: type[Enum] = build_enum_from_lexicon_category("sensor_type") +WellPumpType: type[Enum] = build_enum_from_lexicon_category("well_pump_type") +PermissionType: type[Enum] = build_enum_from_lexicon_category("permission_type") GroupType: type[Enum] = build_enum_from_lexicon_category("group_type") MonitoringFrequency: type[Enum] = build_enum_from_lexicon_category( "monitoring_frequency" ) +AquiferType: type[Enum] = build_enum_from_lexicon_category("aquifer_type") +GeographicScale: type[Enum] = build_enum_from_lexicon_category("geographic_scale") +Lithology: type[Enum] = build_enum_from_lexicon_category("lithology") +FormationCode: type[Enum] = build_enum_from_lexicon_category("formation_code") # ============= EOF ============================================= diff --git a/core/formations.json b/core/formations.json new file mode 100644 index 000000000..e69de29bb diff --git a/core/lexicon.json b/core/lexicon.json index 9aa9b88ae..815a40d2f 100644 --- a/core/lexicon.json +++ b/core/lexicon.json @@ -2,9 +2,10 @@ {"name": "activity_type", "description": null}, {"name": "address_type", "description": null}, {"name": "analysis_method_type", "description": null}, + {"name": "aquifer_type", "description": null}, {"name": "casing_material", "description": null}, {"name": "collection_method", "description": null}, - {"name": "construction_method", "description": null}, + {"name": "well_construction_method", "description": null}, {"name": "contact_type", "description": null}, {"name": "coordinate_method", "description": null}, {"name": "country", "description": null}, @@ -18,6 +19,7 @@ {"name": 
"email_type", "description": null}, {"name": "participant_role", "description": null}, {"name": "geochronology", "description": null}, + {"name": "geographic_scale", "description": null}, {"name": "groundwater_level_reason", "description": null}, {"name": "group_type", "description": null}, {"name": "horizontal_datum", "description": null}, @@ -51,7 +53,11 @@ {"name": "well_purpose", "description": null}, {"name": "status_type", "description": null}, {"name": "status_value", "description": null}, - {"name": "origin_source", "description": null} + {"name": "origin_source", "description": null}, + {"name": "well_pump_type", "description": null}, + {"name": "permission_type", "description": null}, + {"name": "formation_code", "description": null}, + {"name": "lithology", "description": null} ], "terms": [ {"categories": ["review_status"], "term": "approved", "definition": "approved"}, @@ -78,15 +84,15 @@ {"categories": ["elevation_method"], "term": "Reported", "definition": "Reported"}, {"categories": ["elevation_method"], "term": "Survey-grade Global Navigation Satellite Sys, Lvl1", "definition": "Survey-grade Global Navigation Satellite Sys, Lvl1"}, {"categories": ["elevation_method"], "term": "USGS National Elevation Dataset (NED)", "definition": "USGS National Elevation Dataset (NED)"}, - {"categories": ["elevation_method", "sample_method", "coordinate_method", "well_purpose", "status", "organization", "role"], "term": "Unknown", "definition": "Unknown"}, - {"categories": ["construction_method"], "term": "Air-rotary", "definition": "Air-rotary"}, - {"categories": ["construction_method"], "term": "Bored or augered", "definition": "Bored or augered"}, - {"categories": ["construction_method"], "term": "Cable-tool", "definition": "Cable-tool"}, - {"categories": ["construction_method"], "term": "Hydraulic rotary (mud or water)", "definition": "Hydraulic rotary (mud or water)"}, - {"categories": ["construction_method"], "term": "Air percussion", "definition": "Air 
percussion"}, - {"categories": ["construction_method"], "term": "Reverse rotary", "definition": "Reverse rotary"}, - {"categories": ["construction_method"], "term": "Driven", "definition": "Driven"}, - {"categories": ["construction_method", "measurement_method"], "term": "Other (explain in notes)", "definition": "Other (explain in notes)"}, + {"categories": ["elevation_method", "sample_method", "coordinate_method", "well_purpose", "status", "organization", "role", "aquifer_type"], "term": "Unknown", "definition": "Unknown"}, + {"categories": ["well_construction_method"], "term": "Air-Rotary", "definition": "Air-Rotary"}, + {"categories": ["well_construction_method"], "term": "Bored or augered", "definition": "Bored or augered"}, + {"categories": ["well_construction_method"], "term": "Cable-tool", "definition": "Cable-tool"}, + {"categories": ["well_construction_method"], "term": "Hydraulic rotary (mud or water)", "definition": "Hydraulic rotary (mud or water)"}, + {"categories": ["well_construction_method"], "term": "Air percussion", "definition": "Air percussion"}, + {"categories": ["well_construction_method"], "term": "Reverse rotary", "definition": "Reverse rotary"}, + {"categories": ["well_construction_method"], "term": "Driven", "definition": "Driven"}, + {"categories": ["well_construction_method", "measurement_method"], "term": "Other (explain in notes)", "definition": "Other (explain in notes)"}, {"categories": ["coordinate_method"], "term": "Differentially corrected GPS", "definition": "Differentially corrected GPS"}, {"categories": ["coordinate_method"], "term": "Survey-grade global positioning system (SGPS)", "definition": "Survey-grade global positioning system (SGPS)"}, {"categories": ["coordinate_method"], "term": "GPS, uncorrected", "definition": "GPS, uncorrected"}, @@ -572,6 +578,7 @@ {"categories": ["organization"], "term": "Yates Petroleum Corporation", "definition": "Yates Petroleum Corporation"}, {"categories": ["organization"], "term": "Zamora 
Accounting Services", "definition": "Zamora Accounting Services"}, {"categories": ["organization"], "term": "PLSS", "definition": "Public Land Survey System"}, + {"categories": ["organization"], "term": "Quemado Municipal Water & SWA", "definition": "Quemado Municipal Water & SWA"}, {"categories": ["collection_method"], "term": "Altimeter", "definition": "ALtimeter"}, {"categories": ["collection_method"], "term": "Differentially corrected GPS", "definition": "Differentially corrected GPS"}, {"categories": ["collection_method"], "term": "Survey-grade GPS", "definition": "Survey-grade GPS"}, @@ -692,6 +699,21 @@ {"categories": ["monitoring_frequency"], "term": "Annual", "definition": "Location is monitored once a year."}, {"categories": ["monitoring_frequency"], "term": "Decadal", "definition": "Location is monitored once every ten years."}, {"categories": ["monitoring_frequency"], "term": "Event-based", "definition": "Location is monitored based on specific events or triggers rather than a fixed schedule."}, + {"categories": ["aquifer_type"], "term": "Artesian", "definition": "Artesian"}, + {"categories": ["aquifer_type"], "term": "Confined single aquifer", "definition": "Confined single aquifer"}, + {"categories": ["aquifer_type"], "term": "Unsaturated (dry)", "definition": "Unsaturated (dry)"}, + {"categories": ["aquifer_type"], "term": "Fractured", "definition": "Fractured"}, + {"categories": ["aquifer_type"], "term": "Confined multiple aquifers", "definition": "Confined multiple aquifers"}, + {"categories": ["aquifer_type"], "term": "Unconfined multiple aquifers", "definition": "Unconfined multiple aquifers"}, + {"categories": ["aquifer_type"], "term": "Perched aquifer", "definition": "Perched aquifer"}, + {"categories": ["aquifer_type"], "term": "Confining layer or aquitard", "definition": "Confining layer or aquitard"}, + {"categories": ["aquifer_type"], "term": "Semi-confined", "definition": "Semi-confined"}, + {"categories": ["aquifer_type"], "term": 
"Unconfined single aquifer", "definition": "Unconfined single aquifer"}, + {"categories": ["aquifer_type"], "term": "Mixed (confined and unconfined multiple aquifers)", "definition": "Mixed (confined and unconfined multiple aquifers)"}, + {"categories": ["geographic_scale"], "term": "Major", "definition": "Major aquifers of national significance"}, + {"categories": ["geographic_scale"], "term": "Regional", "definition": "Important aquifers serving regions"}, + {"categories": ["geographic_scale"], "term": "Local", "definition": "Smaller, locally important aquifers"}, + {"categories": ["geographic_scale"], "term": "Minor", "definition": "Limited extent or yield"}, {"categories": ["origin_source"], "term": "Reported by another agency", "definition": "Reported by another agency"}, {"categories": ["origin_source"], "term": "From driller's log or well report", "definition": "From driller's log or well report"}, {"categories": ["origin_source"], "term": "Private geologist, consultant or univ associate", "definition": "Private geologist, consultant or univ associate"}, @@ -709,6 +731,13 @@ {"categories": ["note_type"], "term": "Historical", "definition": "Historical information or context about the well or location."}, {"categories": ["note_type"], "term": "Other", "definition": "Other types of notes that do not fit into the predefined categories."}, {"categories": ["note_type"], "term": "Water", "definition": "Water bearing zone information and other info from ose reports"}, - {"categories": ["note_type"], "term": "Measuring", "definition": "Notes about measuring/visiting the well, on Access form"} + {"categories": ["note_type"], "term": "Measuring", "definition": "Notes about measuring/visiting the well, on Access form"}, + {"categories": ["well_pump_type"], "term": "Submersible", "definition": "Submersible"}, + {"categories": ["well_pump_type"], "term": "Jet", "definition": "Jet Pump"}, + {"categories": ["well_pump_type"], "term": "Line Shaft", "definition": "Line 
Shaft"}, + {"categories": ["well_pump_type"], "term": "Hand", "definition": "Hand Pump"}, + {"categories": ["permission_type"], "term": "Water Level Sample", "definition": "Permissions for taking water level samples"}, + {"categories": ["permission_type"], "term": "Water Chemistry Sample", "definition": "Permissions for water taking chemistry samples"}, + {"categories": ["permission_type"], "term": "Datalogger Installation", "definition": "Permissions for installing dataloggers"} ] } \ No newline at end of file diff --git a/db/__init__.py b/db/__init__.py index 5a58441f8..4a0fc8e70 100644 --- a/db/__init__.py +++ b/db/__init__.py @@ -33,7 +33,7 @@ from db.notes import * from db.observation import * from db.parameter import * -from db.permission import * +from db.permission_history import * from db.publication import * from db.regulatory_limit import * from db.sample import * @@ -43,6 +43,11 @@ from db.transducer import * from db.measuring_point_history import * from db.data_provenance import * +from db.aquifer_system import * +from db.geologic_formation import * +from db.thing_aquifer_association import * +from db.thing_geologic_formation_association import * +from db.aquifer_type import * from sqlalchemy import ( func, diff --git a/db/aquifer_system.py b/db/aquifer_system.py new file mode 100644 index 000000000..c202d77c9 --- /dev/null +++ b/db/aquifer_system.py @@ -0,0 +1,84 @@ +""" +SQLAlchemy model for the AquiferSystem table. + +This is a master reference table for aquifer systems and hydrogeologic units. 
+""" + +from typing import List, TYPE_CHECKING + +from sqlalchemy import Text, Index +from sqlalchemy.orm import relationship, Mapped, mapped_column +from sqlalchemy.ext.associationproxy import association_proxy, AssociationProxy +from geoalchemy2 import Geometry + +from db.base import Base, AutoBaseMixin, ReleaseMixin +from db.lexicon import lexicon_term + +from constants import SRID_WGS84 + +if TYPE_CHECKING: + from db.thing import WellScreen, ThingAquiferAssociation, Thing + from db.aquifer_type import AquiferType + + +class AquiferSystem(Base, AutoBaseMixin, ReleaseMixin): + __versioned__ = {} + + name: Mapped[str] = mapped_column( + nullable=False, + unique=True, + comment="The full, human-readable name of the aquifer system (e.g., 'Ogallala Aquifer').", + ) + description: Mapped[str] = mapped_column( + Text, + nullable=True, + comment="A detailed description of the aquifer system, its characteristics, and its significance.", + ) + # Lexicon terms were retrieved from NMAquifer's 'LU_AquiferType' table. 
+ primary_aquifer_type: Mapped[str] = lexicon_term( + nullable=False, + comment="A controlled vocabulary field to classify the aquifer system as a whole (e.g., 'Unconfined', 'Confined', 'Perched').", + ) + geographic_scale: Mapped[str] = lexicon_term( + nullable=True, + comment="A controlled vocabulary field to classify the aquifer's geographic scale (e.g., 'Major', 'Regional', 'Local').", + ) + boundary: Mapped[Geometry] = mapped_column( + Geometry(geometry_type="MULTIPOLYGON", srid=SRID_WGS84, spatial_index=True), + nullable=True, + comment="A spatial representation of the aquifer system's boundary.", + ) + # Hierarchical relationship fields (may be implemented in future iterations) + # Example: High Plains Aquifer (parent) contains Ogallala Aquifer (child) + # parent_id = Column(Integer, ForeignKey('aquifer_system.id')) + # parent = relationship('AquiferSystem', remote_side=[id], backref='subsystems') + + # --- Relationships --- + # One-To-Many: An AquiferSystem can be associated with many wells (Things) via the ThingAquiferAssociation join table. + thing_associations: Mapped[List["ThingAquiferAssociation"]] = relationship( + "ThingAquiferAssociation", + back_populates="aquifer_system", + cascade="all, delete-orphan", + passive_deletes=True, + ) + + # One-To-Many: An AquiferSystem can be the target for many individual WellScreens. + well_screens: Mapped[List["WellScreen"]] = relationship( + "WellScreen", + back_populates="aquifer_system", + cascade="all, delete-orphan", + passive_deletes=True, + ) + + # --- Association Proxies --- + # Proxy to directly access Things (wells) associated with this AquiferSystem. + things: AssociationProxy[List["Thing"]] = association_proxy( + "thing_associations", "thing" + ) + # Proxy to directly access all AquiferTypes associated with this AquiferSystem. 
+ aquifer_types: AssociationProxy[List["AquiferType"]] = association_proxy( + "thing_associations", "aquifer_types" + ) + + # --- Table Arguments --- + __table_args__ = (Index("ix_aquifersystem_name", "name"),) diff --git a/db/aquifer_type.py b/db/aquifer_type.py new file mode 100644 index 000000000..32900d801 --- /dev/null +++ b/db/aquifer_type.py @@ -0,0 +1,58 @@ +""" +SQLAlchemy model for the AquiferType table. + +This table stores the specific aquifer characteristics/types associated with +a Thing-AquiferSystem relationship. It allows capturing that a single aquifer +can have multiple characteristics simultaneously. + +Example: + A well in the "Ogallala" aquifer might tap portions that are both + "Fractured" AND "Confined". This would create: + - One AquiferSystem: "Ogallala" + - One ThingAquiferAssociation: linking well to Ogallala + - Two AquiferType records: "Fractured" and "Confined" +""" + +from typing import TYPE_CHECKING + +from sqlalchemy import ForeignKey +from sqlalchemy.orm import relationship, Mapped, mapped_column + +from db.base import Base, AutoBaseMixin, ReleaseMixin, lexicon_term + +if TYPE_CHECKING: + from db.thing_aquifer_association import ThingAquiferAssociation + + +class AquiferType(Base, AutoBaseMixin, ReleaseMixin): + """ + Represents the specific aquifer types/characteristics for a + Thing-AquiferSystem association. 
+ + This allows modeling the fact that: + - A single aquifer can have multiple characteristics + - Different wells may tap different characteristics of the same aquifer + - Characteristics are attributes of the relationship, not the aquifer itself + + Fields from WellData CSV: + - AquiferType: May contain multiple codes (e.g., "FC" = Fractured + Confined) + - Each code becomes a separate AquiferType record + """ + + # --- Columns --- + thing_aquifer_association_id: Mapped[int] = mapped_column( + ForeignKey("thing_aquifer_association.id", ondelete="CASCADE"), + nullable=False, + comment="Links to the Thing-Aquifer association this type describes.", + ) + aquifer_type: Mapped[str] = lexicon_term( + nullable=False, + comment="Controlled vocabulary for aquifer hydrologic properties. " + "Examples: 'Unconfined', 'Confined', 'Perched', 'Fractured', 'Unconsolidated'.", + ) + + # --- Relationships --- + # Many-to-One: Multiple aquifer types can belong to one association + thing_aquifer_association: Mapped["ThingAquiferAssociation"] = relationship( + "ThingAquiferAssociation", back_populates="aquifer_types" + ) diff --git a/db/contact.py b/db/contact.py index 7855814fb..558724df9 100644 --- a/db/contact.py +++ b/db/contact.py @@ -26,7 +26,7 @@ from db.field import FieldEventParticipant, FieldEvent from db.thing import Thing from db.publication import Author, AuthorContactAssociation - from db.permission import Permission + from db.permission_history import PermissionHistory class ThingContactAssociation(Base, AutoBaseMixin): @@ -74,8 +74,10 @@ class Contact(Base, AutoBaseMixin, ReleaseMixin): ) # One-To-Many: A Contact can grant many Permissions. 
- permissions: Mapped[List["Permission"]] = relationship( - "Permission", back_populates="contact", cascade="all, delete, delete-orphan" + permissions: Mapped[List["PermissionHistory"]] = relationship( + "PermissionHistory", + back_populates="contact", + cascade="all, delete, delete-orphan", ) # One-To-Many: A Contact can be associated with many Authors (in Publications). author_associations: Mapped[List["AuthorContactAssociation"]] = relationship( diff --git a/db/data_provenance.py b/db/data_provenance.py index 06c468c8d..20505d94c 100644 --- a/db/data_provenance.py +++ b/db/data_provenance.py @@ -19,7 +19,7 @@ from sqlalchemy import Integer, Index, and_ from sqlalchemy.orm import relationship, Mapped, mapped_column, declared_attr, foreign -from db.base import Base, AutoBaseMixin, ReleaseMixin, pascal_to_snake +from db.base import Base, AutoBaseMixin, ReleaseMixin from db import lexicon_term @@ -53,9 +53,13 @@ class DataProvenance(AutoBaseMixin, ReleaseMixin, Base): ) # Values from the following NMAquifer tables are included as `origin_source` terms in the lexicon: # 'LU_DataSource', 'LU_Depth_CompletionSource'. - origin_source: Mapped[str] = lexicon_term( + origin_type: Mapped[str] = lexicon_term( nullable=True, - comment="Indicates the origin source of the data (e.g'Driller's Log', 'Well Report'.", + comment="Indicates the type of origin the data (e.g'Driller's Log', 'Well Report'.", + ) + origin_source: Mapped[str] = mapped_column( + nullable=True, + comment="The specific source of the data (e.g., 'J. Brown Thesis, \"I like APIs\", Pomona College, 1994').", ) # Values from the following NMAquifer tables are included as `collection_method` terms in the lexicon: # 'LU_AltitudeMethod','LU_CoordinateMethod'. 
@@ -116,7 +120,7 @@ def data_provenance(cls): "DataProvenance", primaryjoin=and_( cls.id == foreign(DataProvenance.target_id), - DataProvenance.target_table == pascal_to_snake(cls.__name__), + DataProvenance.target_table == cls.__tablename__, ), lazy="selectin", viewonly=True, diff --git a/db/geologic_formation.py b/db/geologic_formation.py new file mode 100644 index 000000000..2379f50f4 --- /dev/null +++ b/db/geologic_formation.py @@ -0,0 +1,82 @@ +""" +SQLAlchemy model for the GeologicFormation table. + +This table is a master reference table for geologic formations. Its purpose is to store definitions and descriptions +of various geologic formations that can be referenced by other tables in the database. +""" + +from typing import List, TYPE_CHECKING + +from sqlalchemy import Text, Index +from sqlalchemy.orm import relationship, Mapped, mapped_column +from sqlalchemy.ext.associationproxy import association_proxy, AssociationProxy +from geoalchemy2 import Geometry + +from db.base import Base, AutoBaseMixin, ReleaseMixin +from db.lexicon import lexicon_term + +from constants import SRID_WGS84 + +if TYPE_CHECKING: + from db.thing import Thing, WellScreen + from db.thing_geologic_formation_association import ( + ThingGeologicFormationAssociation, + ) + + +class GeologicFormation(Base, AutoBaseMixin, ReleaseMixin): + __versioned__ = {} + + # TODO: Let the API map formation codes to names using a formations.json file that can be periodically updated + # from the authoritative source (e.g., USGS). A placeholder `formations.json` file has been added to the `core` + # directory. 
+ # name: Mapped[str] = mapped_column( + # nullable=False, + # unique=True, + # comment="The full, human-readable name of the geologic formation (e.g., 'Navajo Sandstone').", + # ) + formation_code: Mapped[str] = lexicon_term( + nullable=True, + unique=True, + comment="A short code or abbreviation for the geologic formation (e.g., '120ELRT').", + ) + description: Mapped[str] = mapped_column( + Text, + nullable=True, + comment="A detailed description of the geologic formation, its characteristics, and its significance.", + ) + # TODO: Implement controlled vocabularies for `lithology` using NMAquifer's 'LU_Lithology' table. + # This should be implemented after AMMP reviews and cleans up their formation terms and codes. + lithology: Mapped[str] = lexicon_term( + nullable=True, + comment="A controlled vocabulary for the primary, dominant rock type" + "(e.g., 'Tuff', 'Sandstone', 'Alluvium', 'Shale').", + ) + boundary: Mapped[Geometry] = mapped_column( + Geometry(geometry_type="MULTIPOLYGON", srid=SRID_WGS84, spatial_index=True), + nullable=True, + comment="A spatial representation of the geologic formation's extent.", + ) + + # --- Relationships --- + # One-To-Many (Association Object): A GeologicFormation can be associated with many Things (e.g., wells) via the + # ThingGeologicFormationAssociation join table. + thing_associations: Mapped[List["ThingGeologicFormationAssociation"]] = ( + relationship( + "ThingGeologicFormationAssociation", + back_populates="geologic_formation", + cascade="all, delete-orphan", + passive_deletes=True, + ) + ) + # One-To-Many: A GeologicFormation can have many physical WellScreens installed in it. + well_screens: Mapped[List["WellScreen"]] = relationship( + "WellScreen", back_populates="geologic_formation", passive_deletes=True + ) + + # --- Association Proxies --- + # Provides direct access to Things (wells) that penetrate this formation. 
+ things: AssociationProxy["Thing"] = association_proxy("thing_associations", "thing") + + # --- Table Arguments --- + __table_args__ = (Index("ix_geologicformation_formation_code", "formation_code"),) diff --git a/db/notes.py b/db/notes.py index ab8384064..0e2e8ab8b 100644 --- a/db/notes.py +++ b/db/notes.py @@ -97,7 +97,7 @@ def notes(cls): "Notes", primaryjoin=and_( cls.id == foreign(Notes.target_id), - Notes.target_table == cls.__name__, + Notes.target_table == cls.__tablename__, ), cascade="all, delete-orphan", lazy="selectin", @@ -120,7 +120,7 @@ def add_note( content=content, note_type=note_type, target_id=self.id, - target_table=self.__class__.__name__, + target_table=self.__class__.__tablename__, release_status=release_status, ) diff --git a/db/permission_history.py b/db/permission_history.py new file mode 100644 index 000000000..591046bba --- /dev/null +++ b/db/permission_history.py @@ -0,0 +1,96 @@ +""" +db/permission_history.py + +This model defines the `PermissionHistory` table, a polymorphic table that tracks +all legal and administrative agreements related to site access and activity. +Its purpose is to track who granted permission, what activities they authorized, +which entity the permission applies to, and for what period of time. +""" + +from typing import TYPE_CHECKING +from datetime import date +from sqlalchemy import Integer, ForeignKey, String, and_ +from sqlalchemy.orm import relationship, Mapped, mapped_column, declared_attr, foreign + +from db.base import Base, AutoBaseMixin, ReleaseMixin, lexicon_term + + +if TYPE_CHECKING: + from db.contact import Contact + from db.thing import Thing + from db.location import Location + + +class PermissionHistory(Base, AutoBaseMixin, ReleaseMixin): + """ + Represents a specific grant of permission from a Contact for a + specific entity (e.g., a Thing or Location). 
+ """ + + # --- Foreign Keys --- + contact_id: Mapped[int] = mapped_column( + Integer, ForeignKey("contact.id", ondelete="CASCADE"), nullable=False + ) + + # --- Columns --- + permission_type: Mapped[str] = lexicon_term(nullable=False) + permission_allowed: Mapped[bool] = mapped_column(nullable=False, default=False) + start_date: Mapped[date] = mapped_column(nullable=False) + end_date: Mapped[date] = mapped_column(nullable=True) + notes: Mapped[str] = mapped_column(nullable=True) + + # --- Polymorphic Columns --- + target_id: Mapped[int] = mapped_column(nullable=False) + target_table: Mapped[str] = mapped_column(String(50), nullable=False) + + # --- Relationships --- + # Many-To-One: A Permission is granted by one Contact. + contact: Mapped["Contact"] = relationship("Contact", back_populates="permissions") + + # --- Polymorphic Parent Relationships (Internal) --- + # These are view-only relationships used by the 'target' property below. + # They tell SQLAlchemy exactly how to find the specific parent record for a given child. + _thing_target: Mapped["Thing"] = relationship( + "Thing", + primaryjoin="and_(foreign(PermissionHistory.target_id) == Thing.id, " + "PermissionHistory.target_table == 'thing')", + viewonly=True, + ) + _location_target: Mapped["Location"] = relationship( + "Location", + primaryjoin="and_(foreign(PermissionHistory.target_id) == Location.id, " + "PermissionHistory.target_table == 'location')", + viewonly=True, + ) + + @property + def target(self): + """ + A generic property to get the parent object (Thing, Location, etc.). + This is useful for simplifying application code by providing a single, + consistent way to access the parent of a polymorphic record. + """ + return getattr(self, f"_{self.target_table}_target") + + +class PermissionHistoryMixin: + """ + Mixin for models that can have permissions (e.g., Thing, Location). + It automatically creates a polymorphic One-to-Many relationship to the + Permission table. 
+ """ + + @declared_attr + def permission_history(cls): + # One-to-Many polymorphic relationship + return relationship( + "PermissionHistory", + primaryjoin=( + and_( + cls.id == foreign(PermissionHistory.target_id), + PermissionHistory.target_table == cls.__tablename__, + ) + ), + lazy="selectin", + viewonly=True, + ) diff --git a/db/status_history.py b/db/status_history.py index 8b3ee2321..15b5aec2f 100644 --- a/db/status_history.py +++ b/db/status_history.py @@ -19,7 +19,7 @@ ) from sqlalchemy.orm import Mapped, mapped_column, declared_attr, relationship, foreign -from db.base import Base, AutoBaseMixin, ReleaseMixin, lexicon_term, pascal_to_snake +from db.base import Base, AutoBaseMixin, ReleaseMixin, lexicon_term class StatusHistory(Base, AutoBaseMixin, ReleaseMixin): @@ -47,7 +47,7 @@ def status_history(cls): "StatusHistory", primaryjoin=and_( cls.id == foreign(StatusHistory.target_id), - StatusHistory.target_table == pascal_to_snake(cls.__name__), + StatusHistory.target_table == cls.__tablename__, ), cascade="all, delete-orphan", lazy="selectin", diff --git a/db/thing.py b/db/thing.py index 9f30d08e2..92c7bd942 100644 --- a/db/thing.py +++ b/db/thing.py @@ -26,8 +26,9 @@ AutoBaseMixin, Base, ReleaseMixin, - PermissionMixin, ) +from db.permission_history import PermissionHistoryMixin +from services.util import retrieve_latest_polymorphic_history_table_record from db.status_history import StatusHistoryMixin from db.measuring_point_history import MeasuringPointHistory from db.data_provenance import DataProvenanceMixin @@ -40,6 +41,12 @@ from db.sensor import Sensor from db.contact import Contact from db.group import Group, GroupThingAssociation + from db.aquifer_system import AquiferSystem + from db.thing_aquifer_association import ThingAquiferAssociation + from db.geologic_formation import GeologicFormation + from db.thing_geologic_formation_association import ( + ThingGeologicFormationAssociation, + ) class Thing( @@ -47,7 +54,7 @@ class Thing( AutoBaseMixin, 
ReleaseMixin, StatusHistoryMixin, - PermissionMixin, + PermissionHistoryMixin, DataProvenanceMixin, NotesMixin, ): @@ -64,10 +71,6 @@ class Thing( comment="To audit where the data came from in NM_Aquifer if it was transferred over", ) - # notes = mapped_column(Text, nullable=True) - # measuring_notes = mapped_column(Text, nullable=True) - # water_notes = mapped_column(Text, nullable=True) - # TODO: should `name` be unique? name: Mapped[str] = mapped_column( nullable=False, @@ -116,6 +119,32 @@ class Thing( well_construction_notes: Mapped[str] = mapped_column(Text, nullable=True) + well_completion_date: Mapped[date] = mapped_column( + nullable=True, comment="the date the well was completed if known" + ) + well_driller_name: Mapped[str] = mapped_column( + String(200), nullable=True, comment="Name of the well driller." + ) + well_construction_method: Mapped[str] = lexicon_term(nullable=True) + well_pump_type: Mapped[str] = lexicon_term(nullable=True) + well_pump_depth: Mapped[float] = mapped_column( + Float, + nullable=True, + info={"unit": "feet below ground surface"}, + comment="Depth of the well pump from ground surface to the pump intake (in feet).", + ) + formation_completion_code: Mapped[str] = lexicon_term( + nullable=True, + comment="The geologic formation in which the well was completed (from WellData.FormationZone). " + "This indicates the target formation for the well, not the full stratigraphic column. " + "For detailed depth-interval stratigraphy, see formation_associations.", + ) + # TODO: should this be required for every well in the database? AMMP review + is_suitable_for_datalogger: Mapped[bool] = mapped_column( + nullable=True, + comment="Indicates if the well is suitable for datalogger installation.", + ) + # Spring-related columns spring_type: Mapped[str] = lexicon_term( nullable=True, @@ -263,6 +292,26 @@ class Thing( lazy="joined", ) + # One-To-Many: A Thing can be associated with many AquiferSystems via the ThingAquiferAssociation join table. 
+ aquifer_associations: Mapped[List["ThingAquiferAssociation"]] = relationship( + "ThingAquiferAssociation", + back_populates="thing", + cascade="all, delete-orphan", + passive_deletes=True, + lazy="joined", + ) + + # Many-To-Many: A Thing can penetrate many GeologicFormations. + formation_associations: Mapped[List["ThingGeologicFormationAssociation"]] = ( + relationship( + "ThingGeologicFormationAssociation", + back_populates="thing", + cascade="all, delete-orphan", + passive_deletes=True, + lazy="joined", + ) + ) + # --- Association Proxies --- assets: AssociationProxy[list["Asset"]] = association_proxy( "asset_associations", "asset" @@ -288,6 +337,16 @@ class Thing( "group_associations", "group" ) + # Proxy to directly access AquiferSystems associated with this Thing + aquifer_systems: AssociationProxy[List["AquiferSystem"]] = association_proxy( + "aquifer_associations", "aquifer_system" + ) + + # Proxy to directly access the GeologicFormations penetrated by this Thing. + geologic_formations: AssociationProxy[List["GeologicFormation"]] = ( + association_proxy("formation_associations", "geologic_formation") + ) + # Full-text search vector search_vector = Column(TSVectorType("name", "well_construction_notes")) @@ -379,7 +438,48 @@ def measuring_point_description(self) -> str | None: @property def well_depth_source(self) -> str | None: - return self._get_data_provenance_attribute("well_depth", "origin_source") + return self._get_data_provenance_attribute("well_depth", "origin_type") + + @property + def well_completion_date_source(self) -> str | None: + return self._get_data_provenance_attribute( + "well_completion_date", "origin_type" + ) + + @property + def well_construction_method_source(self) -> str | None: + return self._get_data_provenance_attribute( + "well_construction_method", "origin_source" + ) + + @property + def aquifers(self) -> List[dict]: + """ + Returns a list of aquifer systems and their associated types for this Thing. 
+ Each aquifer system is represented as a dictionary with its name and a list of types. + """ + aquifer_list = [] + for association in self.aquifer_associations: + aquifer_info = { + "aquifer_system": association.aquifer_system.name, + "aquifer_types": [ + atype.aquifer_type for atype in association.aquifer_types + ], + } + aquifer_list.append(aquifer_info) + return aquifer_list + + @property + def permissions(self) -> list: + """ + Returns the associated permissions or an empty list. If there are no + associated permissions, an empty list is returned instead of None to + allow the API to serialize correctly (see schemas/thing.py). + """ + if self.permission_history: + return self.permission_history + else: + return [] class ThingIdLink(Base, AutoBaseMixin, ReleaseMixin): @@ -406,6 +506,12 @@ class WellScreen(Base, AutoBaseMixin, ReleaseMixin): thing_id: Mapped[int] = mapped_column( ForeignKey("thing.id", ondelete="CASCADE"), nullable=False ) + aquifer_system_id: Mapped[int] = mapped_column( + ForeignKey("aquifer_system.id", ondelete="SET NULL"), nullable=True + ) + geologic_formation_id: Mapped[int] = mapped_column( + ForeignKey("geologic_formation.id", ondelete="SET NULL"), nullable=True + ) screen_depth_top: Mapped[float] = mapped_column( info={"unit": "feet below ground surface"}, nullable=True ) @@ -423,6 +529,14 @@ class WellScreen(Base, AutoBaseMixin, ReleaseMixin): # Many-To-One: A WellScreen belongs to one Thing. 
thing: Mapped["Thing"] = relationship("Thing", back_populates="screens") + aquifer_system: Mapped["AquiferSystem"] = relationship( + "AquiferSystem", back_populates="well_screens", passive_deletes=True + ) + + geologic_formation: Mapped["GeologicFormation"] = relationship( + "GeologicFormation", back_populates="well_screens", passive_deletes=True + ) + class WellPurpose(Base, AutoBaseMixin, ReleaseMixin): """ diff --git a/db/thing_aquifer_association.py b/db/thing_aquifer_association.py new file mode 100644 index 000000000..cca5758a9 --- /dev/null +++ b/db/thing_aquifer_association.py @@ -0,0 +1,51 @@ +""" +SQLAlchemy model for the ThingAquiferAssociation table. + +This table is a join table (or "association object") whose purpose is to manage +the many-to-many relationship between a Thing and an AquiferSystem. +""" + +from typing import TYPE_CHECKING + +from sqlalchemy import ForeignKey + +from sqlalchemy.orm import relationship, Mapped, mapped_column + +from db.base import Base, AutoBaseMixin, ReleaseMixin + +if TYPE_CHECKING: + from db.thing import Thing + from db.aquifer_system import AquiferSystem + from db.aquifer_type import AquiferType + + +class ThingAquiferAssociation(Base, AutoBaseMixin, ReleaseMixin): + """ + Represents the association of a Thing to an AquiferSystem. This is an Association Object. + """ + + thing_id: Mapped[int] = mapped_column( + ForeignKey("thing.id", ondelete="CASCADE"), nullable=False + ) + aquifer_system_id: Mapped[int] = mapped_column( + ForeignKey("aquifer_system.id", ondelete="CASCADE"), nullable=False + ) + + # --- Relationship Definitions --- + # Many-To-One: This association links to one Thing. + thing: Mapped["Thing"] = relationship( + "Thing", back_populates="aquifer_associations", lazy="joined" + ) + + # Many-To-One: This association links to one AquiferSystem. 
+ aquifer_system: Mapped["AquiferSystem"] = relationship( + "AquiferSystem", back_populates="thing_associations", lazy="joined" + ) + # One-To-Many: An association can have multiple aquifer types. + aquifer_types: Mapped[list["AquiferType"]] = relationship( + "AquiferType", + back_populates="thing_aquifer_association", + cascade="all, delete-orphan", + passive_deletes=True, + lazy="joined", + ) diff --git a/db/thing_geologic_formation_association.py b/db/thing_geologic_formation_association.py new file mode 100644 index 000000000..0707df269 --- /dev/null +++ b/db/thing_geologic_formation_association.py @@ -0,0 +1,60 @@ +""" +SQLAlchemy model for the ThingGeologicFormationAssociation table. + +This table is an association object that creates a many-to-many relationship between a Thing (well) and a +GeologicFormation. It stores the lithology for a well, detailing the depth intervals for each formation it penetrates. +""" + +from typing import TYPE_CHECKING + +from sqlalchemy import ForeignKey +from sqlalchemy.orm import relationship, Mapped, mapped_column + +from db.base import Base, AutoBaseMixin, ReleaseMixin + +if TYPE_CHECKING: + from db.thing import Thing + from db.geologic_formation import GeologicFormation + + +class ThingGeologicFormationAssociation(Base, AutoBaseMixin, ReleaseMixin): + """ + This is a join table (Association Object). It represents the association of a Thing to a + GeologicFormation at a specific depth interval. + """ + + # --- Foreign Keys --- + thing_id: Mapped[int] = mapped_column( + ForeignKey("thing.id", ondelete="CASCADE"), + nullable=False, + comment="The foreign key linking this record to the `Thing` table." + "Deleting a `Thing` will cascade and delete its formation log.", + ) + geologic_formation_id: Mapped[int] = mapped_column( + ForeignKey("geologic_formation.id", ondelete="SET NULL"), + nullable=True, + comment="The foreign key linking this record to the `GeologicFormation` table." 
+ "This is set to `SET NULL` on delete, as deleting a formation definition (a rare admin action)" + "should not delete the historical fact that a well had a pick at this depth.", + ) + + # Depth interval fields + top_depth: Mapped[float] = mapped_column( + nullable=False, + comment="The depth (in feet) to the top of the geologic formation, as measured from ground surface.", + ) + bottom_depth: Mapped[float] = mapped_column( + nullable=False, + comment="The depth (in feet) to the bottom of the geologic formation, as measured from ground surface.", + ) + + # --- Relationship Definitions --- + # Many-To-One: This association links to one Thing. + thing: Mapped["Thing"] = relationship( + "Thing", back_populates="formation_associations", lazy="joined" + ) + + # Many-To-One: This association links to one GeologicFormation. + geologic_formation: Mapped["GeologicFormation"] = relationship( + "GeologicFormation", back_populates="thing_associations", lazy="joined" + ) diff --git a/schemas/aquifer_system.py b/schemas/aquifer_system.py new file mode 100644 index 000000000..1e1961873 --- /dev/null +++ b/schemas/aquifer_system.py @@ -0,0 +1,51 @@ +from typing import List + +from pydantic import BaseModel +from schemas import BaseResponseModel +from schemas.validators import GeometryMixin +from core.enums import AquiferType, GeographicScale # Import specific Enums + + +# ------ CREATE ---------- +class CreateAquiferSystem(GeometryMixin): + """ + Schema for creating an aquifer system. + Used during data transfer and API creation. + """ + + name: str + description: str | None = None + primary_aquifer_type: AquiferType + geographic_scale: GeographicScale | None = None + # boundary field inherited from GeometryMixin + + +# ------ RESPONSE ---------- +class GeoJSONGeometry(BaseModel): + """ + Geometry schema for GeoJSON response. 
+ """ + + type: str = "MULTIPOLYGON" + coordinates: List[List[List[float]]] + + +class GeoJSONProperties(BaseResponseModel): + """ + Response schema for aquifer system details. + """ + + name: str + description: str | None = None + primary_aquifer_type: AquiferType + geographic_scale: GeographicScale | None + + +class AquiferSystemGeoJSONResponse(BaseModel): + """ + Response schema for aquifer system details. + """ + + type: str = "Feature" + geometry: GeoJSONGeometry + properties: GeoJSONProperties diff --git a/schemas/geologic_formation.py b/schemas/geologic_formation.py new file mode 100644 index 000000000..67a3cb24a --- /dev/null +++ b/schemas/geologic_formation.py @@ -0,0 +1,88 @@ +from typing import List + +from pydantic import BaseModel, field_validator, Field + +from schemas import BaseResponseModel +from schemas.validators import DepthIntervalMixin, GeometryMixin +from core.enums import FormationCode, Lithology + + +# ------ CREATE ---------- +class CreateGeologicFormation(GeometryMixin): + """ + Schema for creating a geologic formation. + Used during data transfer and API creation. + """ + + # formation_code has its own custom uppercase validator + formation_code: FormationCode | None = None + description: str | None = None + lithology: Lithology | None = None + # boundary: inherited from GeometryMixin + + @field_validator("formation_code", mode="before") + @classmethod + def upper_case_code(cls, v: str | None) -> str | None: + """ + Automatically uppercase the formation code. + """ + if isinstance(v, str): + return v.upper() + return v + + +class CreateThingGeologicFormationAssociation(DepthIntervalMixin): + """ + Schema for linking a Thing (Well) to a GeologicFormation. + Uses DepthIntervalMixin to enforce bottom_depth > top_depth. 
+ """ + + thing_id: int + geologic_formation_id: int + top_depth: float = Field(ge=0) + bottom_depth: float = Field(ge=0) + + +# ------ RESPONSE ---------- +class GeoJSONGeometry(BaseModel): + """ + Geometry schema for GeoJSON response. + """ + + type: str = "MULTIPOLYGON" + coordinates: List[List[List[float]]] + + +class GeoJSONProperties(BaseResponseModel): + """ + Response schema for geologic formation details. + """ + + formation_code: str | None = None + description: str | None = None + lithology: str | None = None + + +class GeologicFormationGeoJSONResponse(BaseModel): + """ + Response schema for geologic formation details. + """ + + type: str = "Feature" + geometry: GeoJSONGeometry + properties: GeoJSONProperties + + +class ThingGeologicFormationAssociationResponse(BaseResponseModel): + """ + Response schema for the association between a Thing and a GeologicFormation. + Includes depth interval information. + """ + + thing_id: int + geologic_formation_id: int | None = None + geologic_formation: GeologicFormationGeoJSONResponse | None = None + top_depth: float + top_depth_unit: str = "ft" + bottom_depth: float + bottom_depth_unit: str = "ft" diff --git a/schemas/permission_history.py b/schemas/permission_history.py new file mode 100644 index 000000000..e0619d90e --- /dev/null +++ b/schemas/permission_history.py @@ -0,0 +1,18 @@ +from pydantic import BaseModel +from schemas import PastOrTodayDate + +from core.enums import PermissionType + + +# ------ RESPONSE ---------- +class PermissionHistoryResponse(BaseModel): + """ + Even though permission_allowed and start_date are not-nullable in the + database, they are nullable here to accommodate cases where no permission + record exists for a given permission type. 
+ """ + + permission_type: PermissionType + permission_allowed: bool | None + start_date: PastOrTodayDate | None + end_date: PastOrTodayDate | None diff --git a/schemas/thing.py b/schemas/thing.py index cf8c3ef2b..7a7982494 100644 --- a/schemas/thing.py +++ b/schemas/thing.py @@ -24,12 +24,17 @@ ScreenType, Organization, MonitoringFrequency, + Organization, + MonitoringFrequency, + WellConstructionMethod, + WellPumpType, + FormationCode, ) from schemas import BaseCreateModel, BaseUpdateModel, BaseResponseModel, PastOrTodayDate from schemas.group import GroupResponse from schemas.location import LocationGeoJSONResponse from schemas.notes import NoteResponse, CreateNote - +from schemas.permission_history import PermissionHistoryResponse # -------- VALIDATE ---------- @@ -128,8 +133,16 @@ class CreateWell(CreateBaseThing, ValidateWell): measuring_point_height: float = Field( ge=0, description="Measuring point height in feet" ) - measuring_point_description: str | None + measuring_point_description: str | None = None notes: list[CreateNote] | None = None + well_completion_date: PastOrTodayDate | None = None + well_completion_date_source: str | None = None + well_driller_name: str | None = None + well_construction_method: WellConstructionMethod | None = None + well_construction_method_source: str | None = None + well_pump_type: WellPumpType | None = None + is_suitable_for_datalogger: bool | None + formation_completion_code: FormationCode | None = None class CreateSpring(CreateBaseThing): @@ -146,6 +159,8 @@ class CreateWellScreen(BaseCreateModel): """ thing_id: int + aquifer_system_id: int | None = None + geologic_formation_id: int | None = None screen_depth_bottom: float = Field(gt=0, description="Screen depth bottom in feet") screen_depth_top: float = Field(gt=0, description="Screen depth top in feet") screen_type: ScreenType | None = None @@ -220,14 +235,25 @@ class WellResponse(BaseThingResponse): well_casing_depth_unit: str = "ft" well_casing_materials: 
list[CasingMaterial] = [] well_construction_notes: str | None = None + well_completion_date: PastOrTodayDate | None + well_completion_date_source: str | None + well_driller_name: str | None + well_construction_method: WellConstructionMethod | None + well_construction_method_source: str | None + well_pump_type: WellPumpType | None + well_pump_depth: float | None + well_pump_depth_unit: str = "ft" + is_suitable_for_datalogger: bool | None well_status: str | None measuring_point_height: float measuring_point_height_unit: str = "ft" measuring_point_description: str | None - + aquifers: list[dict] = [] water_notes: list[NoteResponse] | None = None measuring_notes: list[NoteResponse] | None = None general_notes: list[NoteResponse] | None = None + permissions: list[PermissionHistoryResponse] + formation_completion_code: FormationCode | None @field_validator("well_purposes", mode="before") def populate_well_purposes_with_strings(cls, well_purposes): @@ -248,6 +274,43 @@ def populate_well_casing_materials_with_strings(cls, well_casing_materials): materials = [] return materials + @field_validator("permissions", mode="before") + def populate_permission_history_with_latest_records(cls, permissions): + """ + Populate the permission history with the latest records for each + type of permission. If multiple records exist for the same permission type + only the most recent one is included. 
If there are no records + the permission_allowed will be None + """ + permissions_to_return = [] + for permission_type in [ + "Water Level Sample", + "Water Chemistry Sample", + "Datalogger Installation", + ]: + # Filter records for the current permission type + filtered_records = [ + record + for record in permissions + if record.permission_type == permission_type and record.end_date is None + ] + if filtered_records: + # Get the most recent record based on start_date + latest_record = max( + filtered_records, key=lambda record: record.start_date + ) + permissions_to_return.append(latest_record) + else: + permissions_to_return.append( + PermissionHistoryResponse( + permission_type=permission_type, + permission_allowed=None, + start_date=None, + end_date=None, + ) + ) + return permissions_to_return + class SpringResponse(BaseThingResponse): """ @@ -269,6 +332,11 @@ class WellScreenResponse(BaseResponseModel): thing_id: int thing: WellResponse + aquifer_system_id: int | None = None + aquifer_system: str | None = None + aquifer_type: str | None = None + geologic_formation_id: int | None = None + geologic_formation: str | None = None screen_depth_bottom: float screen_depth_bottom_unit: str = "ft" screen_depth_top: float @@ -276,6 +344,24 @@ class WellScreenResponse(BaseResponseModel): screen_type: str | None = None screen_description: str | None = None + @field_validator("aquifer_system", mode="before") + def populate_aquifer_system_with_name(cls, aquifer_system): + if aquifer_system is not None: + return aquifer_system.name + return None + + @field_validator("aquifer_type", mode="before") + def populate_aquifer_type_with_name(cls, aquifer_type): + if aquifer_type is not None: + return aquifer_type.name + return None + + @field_validator("geologic_formation", mode="before") + def populate_geologic_formation_with_code(cls, geologic_formation): + if geologic_formation is not None: + return geologic_formation.formation_code + return None + class 
GeoJSONGeometry(BaseModel): """ @@ -342,6 +428,8 @@ class UpdateThingIdLink(BaseUpdateModel): class UpdateWellScreen(BaseUpdateModel): + aquifer_system_id: int | None = None + geologic_formation_id: int | None = None screen_depth_bottom: float | None = None screen_depth_top: float | None = None screen_description: str | None = None diff --git a/schemas/validators.py b/schemas/validators.py new file mode 100644 index 000000000..963047bc2 --- /dev/null +++ b/schemas/validators.py @@ -0,0 +1,43 @@ +""" +schemas/validators.py +Reusable Pydantic validators and mixins for aquifer and geology related schemas. +May consider expansion for other domain models in the future. +""" + +from pydantic import model_validator, field_validator, BaseModel, Field +from services.validation.geospatial import validate_wkt_geometry + + +class DepthIntervalMixin(BaseModel): + """ + Mixin to enforce: + 1. Depths are non-negative (via Field constraints). + 2. Bottom depth > top depth (via model_validator). + Assumes the model has 'top_depth' and 'bottom_depth' fields. + """ + + top_depth: float = Field(ge=0) + bottom_depth: float = Field(ge=0) + + @model_validator(mode="after") + def check_depth_logical_order(self) -> "DepthIntervalMixin": + if self.bottom_depth <= self.top_depth: + raise ValueError( + f"Bottom depth ({self.bottom_depth}) must be greater " + f"than top depth ({self.top_depth})" + ) + return self + + +class GeometryMixin(BaseModel): + """ + Mixin to validate WKT strings for boundary fields. + Delegates logic to the validate_wkt_geometry service function. 
+ """ + + boundary: str | None = None + + @field_validator("boundary") + @classmethod + def validate_wkt(cls, v: str | None) -> str | None: + return validate_wkt_geometry(v) diff --git a/services/thing_helper.py b/services/thing_helper.py index 53ce54577..fdd0424db 100644 --- a/services/thing_helper.py +++ b/services/thing_helper.py @@ -13,6 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # =============================================================================== +from datetime import datetime +from zoneinfo import ZoneInfo + from fastapi import Request from fastapi_pagination.ext.sqlalchemy import paginate from pydantic import BaseModel @@ -32,6 +35,7 @@ WellCasingMaterial, ) from db.group import GroupThingAssociation +from db.measuring_point_history import MeasuringPointHistory from services.audit_helper import audit_add from services.crud_helper import model_patcher from services.exceptions_helper import PydanticStyleException @@ -159,6 +163,10 @@ def add_thing( location_id = data.pop("location_id", None) group_id = data.pop("group_id", None) + # Extract measuring point data (stored in separate history table, not as Thing columns) + measuring_point_height = data.pop("measuring_point_height", None) + measuring_point_description = data.pop("measuring_point_description", None) + try: thing = Thing(**data) thing.thing_type = thing_type @@ -169,6 +177,18 @@ def add_thing( session.flush() session.refresh(thing) + # Create MeasuringPointHistory record if measuring_point_height provided + if measuring_point_height is not None: + measuring_point_history = MeasuringPointHistory( + thing_id=thing.id, + measuring_point_height=measuring_point_height, + measuring_point_description=measuring_point_description, + start_date=datetime.now(tz=ZoneInfo("UTC")), + end_date=None, + ) + audit_add(user, measuring_point_history) + session.add(measuring_point_history) + # endpoint catches ProgrammingError if 
location_id or group_id do not exist if group_id: assoc = GroupThingAssociation() diff --git a/services/util.py b/services/util.py index 313a922ec..e9ec08a94 100644 --- a/services/util.py +++ b/services/util.py @@ -1,13 +1,14 @@ import json import os - -import httpx -import pyproj from shapely.ops import transform +import pyproj +import httpx +from sqlalchemy.orm import DeclarativeBase from sqlalchemy.orm import DeclarativeBase from constants import SRID_WGS84 + TRANSFORMERS = {} METERS_TO_FEET = 3.28084 @@ -59,6 +60,13 @@ def convert_ft_to_m(feet: float | None) -> float | None: return round(feet / METERS_TO_FEET, 6) +def convert_m_to_ft(meters: float | None) -> float | None: + """Convert a length from meters to feet.""" + if meters is None: + return None + return round(meters * METERS_TO_FEET, 6) + + def get_tiger_data( lon: float, lat: float, layer: int, outfields: str = "*" ) -> dict | None: @@ -197,11 +205,10 @@ def retrieve_latest_polymorphic_history_table_record( DeclarativeBase | None The latest record from the specified polymorphic table with the defined type if it exists. 
""" - if polymorphic_relationship == "permissions": + if polymorphic_relationship == "permission_history": type_field = "permission_type" elif polymorphic_relationship == "status_history": type_field = "status_type" - polymorphic_records = getattr(target_record, polymorphic_relationship) type_polymorphic_records = [ r diff --git a/tests/features/environment.py b/tests/features/environment.py index 9b801e9d7..afbc2d13c 100644 --- a/tests/features/environment.py +++ b/tests/features/environment.py @@ -28,12 +28,20 @@ Parameter, Deployment, TransducerObservationBlock, + WellCasingMaterial, + PermissionHistory, + Contact, StatusHistory, ThingIdLink, WellPurpose, MeasuringPointHistory, MonitoringFrequencyHistory, DataProvenance, + AquiferSystem, + AquiferType, + ThingAquiferAssociation, + GeologicFormation, + ThingGeologicFormationAssociation, ) from db.engine import session_ctx @@ -87,9 +95,13 @@ def add_well(context, session, location, name_num): well_construction_notes="Test well construction notes", well_casing_diameter=5.0, well_casing_depth=10.0, - # notes="These are some test well notes", - # measuring_notes="These are some measuring notes", - # water_notes="This are some water notes", + well_completion_date="2013-05-15", + well_driller_name="Jonsi", + well_construction_method="Driven", + well_pump_type="Submersible", + well_pump_depth=8, + is_suitable_for_datalogger=True, + formation_completion_code="000EXRV", ) session.add(well) @@ -116,6 +128,20 @@ def add_well(context, session, location, name_num): return well +@add_context_object_container("well_casing_materials") +def add_well_casing_material(context, session, well): + wcm = WellCasingMaterial( + thing_id=well.id, + material="PVC", + ) + session.add(wcm) + session.commit() + session.refresh(wcm) + + context.objects["well_casing_materials"].append(wcm) + return wcm + + @add_context_object_container("well_purposes") def add_well_purpose(context, session, well, purpose_term): purpose = WellPurpose(thing=well, 
purpose=purpose_term) @@ -189,6 +215,54 @@ def add_spring(context, session, location, name_num): return spring +@add_context_object_container("contacts") +def add_contact(context, session): + contact = Contact( + name="Test Contact", + role="Software Developer", + organization="NMBGMR", + release_status="draft", + contact_type="Primary", + ) + session.add(contact) + session.commit() + session.refresh(contact) + + context.objects["contacts"].append(contact) + return contact + + +@add_context_object_container("permission_histories") +def add_permission_history( + context, + session, + contact_id, + permission_type, + permission_allowed, + start_date, + end_date, + notes, + target_id, + target_table, +): + permission_history = PermissionHistory( + contact_id=contact_id, + permission_type=permission_type, + permission_allowed=permission_allowed, + start_date=start_date, + end_date=end_date, + notes=notes, + target_id=target_id, + target_table=target_table, + ) + session.add(permission_history) + session.commit() + session.refresh(permission_history) + + context.objects["permission_histories"].append(permission_history) + return permission_history + + @add_context_object_container("sensors") def add_sensor(context, session): sensor = Sensor( @@ -317,7 +391,8 @@ def add_data_provenance( target_id, target_table, field_name, - origin_source, + origin_type=None, + origin_source=None, collection_method=None, accuracy_value=None, accuracy_unit=None, @@ -327,6 +402,7 @@ def add_data_provenance( collection_method=collection_method, target_id=target_id, target_table=target_table, + origin_type=origin_type, origin_source=origin_source, accuracy_value=accuracy_value, accuracy_unit=accuracy_unit, @@ -353,6 +429,71 @@ def add_transducer_observation(context, session, block, deployment_id, value): return obs +@add_context_object_container("aquifer_systems") +def add_aquifer_system(context, session, name, well): + aquifer_system = AquiferSystem( + name=name, + description="this is a 
test aquifer", + primary_aquifer_type="Artesian", + geographic_scale="Major", + boundary="MULTIPOLYGON(((0 0, 1 1, 2 2, 3 3, 1 2, 0 0)))", + ) + session.add(aquifer_system) + session.commit() + session.refresh(aquifer_system) + + context.objects["aquifer_systems"].append(aquifer_system) + return aquifer_system + + +@add_context_object_container("thing_aquifer_associations") +def add_thing_aquifer_association(context, session, well, aquifer_system): + association = ThingAquiferAssociation(thing=well, aquifer_system=aquifer_system) + session.add(association) + session.commit() + session.refresh(association) + + context.objects["thing_aquifer_associations"].append(association) + return association + + +@add_context_object_container("aquifer_types") +def add_aquifer_type(context, session, aquifer_type_str, thing_aquifer_association): + aquifer_type = AquiferType( + aquifer_type=aquifer_type_str, + thing_aquifer_association=thing_aquifer_association, + ) + session.add(aquifer_type) + session.commit() + session.refresh(aquifer_type) + + context.objects["aquifer_types"].append(aquifer_type) + return aquifer_type + + +@add_context_object_container("geologic_formations") +def add_geologic_formation(context, session, formation_code, well): + formation = GeologicFormation( + formation_code=formation_code, + description="This is a test geologic formation.", + lithology="Peat", + boundary="MULTIPOLYGON(((0 0, 1 1, 2 2, 3 3, 1 2, 0 0)))", + ) + session.add(formation) + session.commit() + session.refresh(formation) + + association = ThingGeologicFormationAssociation( + top_depth=1, bottom_depth=10, thing=well, geologic_formation=formation + ) + session.add(association) + session.commit() + session.refresh(association) + + context.objects["geologic_formations"].append(formation) + return formation + + def before_all(context): context.objects = {} rebuild = False @@ -374,133 +515,145 @@ def before_all(context): sensor_1 = add_sensor(context, session) deployment = 
add_deployment(context, session, well_1.id, sensor_1.id) - measuring_point_history_1 = add_measuring_point_history( - context, session, well=well_1 - ) - measuring_point_history_2 = add_measuring_point_history( - context, session, well=well_2 - ) - measuring_point_history_3 = add_measuring_point_history( - context, session, well=well_3 - ) - - well_status_1 = add_status_history( - context, - session, - status_type="Well Status", - status_value="Active, pumping well", - start_date=datetime(2020, 1, 1), - end_date=datetime(2021, 1, 1), - reason="Initial status", - target_id=context.objects["wells"][0].id, - target_table="thing", - ) - - well_status_2 = add_status_history( - context, - session, - status_type="Well Status", - status_value="Destroyed, exists but not usable", - start_date=datetime(2021, 1, 1), - end_date=None, - reason="Roving bovine", - target_id=context.objects["wells"][0].id, - target_table="thing", - ) - - monitoring_status_1 = add_status_history( - context, - session, - status_type="Monitoring Status", - status_value="Currently monitored", - start_date=datetime(2020, 1, 1), - end_date=datetime(2021, 1, 1), - reason="Initial monitoring status", - target_id=context.objects["wells"][0].id, - target_table="thing", - ) - - monitoring_status_2 = add_status_history( - context, - session, - status_type="Monitoring Status", - status_value="Not currently monitored", - start_date=datetime(2021, 1, 1), - end_date=None, - reason="Roving bovine destroyed well", - target_id=context.objects["wells"][0].id, - target_table="thing", - ) - - monitoring_frequency_history_1 = add_monitoring_frequency_history( - context, - session, - well=well_1, - monitoring_frequency="Monthly", - start_date="2020-01-01", - end_date="2021-01-01", - ) - - monitoring_frequency_history_2 = add_monitoring_frequency_history( - context, - session, - well=well_1, - monitoring_frequency="Annual", - start_date="2020-01-01", - end_date=None, - ) - - id_link_1 = add_id_link( - context, - session, - 
thing=well_1, - relation="same_as", - alternate_id="12345678", - alternate_organization="USGS", - ) - - id_link_2 = add_id_link( - context, - session, - thing=well_1, - relation="same_as", - alternate_id="OSE-0001", - alternate_organization="NMOSE", - ) - - id_link_3 = add_id_link( - context, - session, - thing=well_1, - relation="same_as", - alternate_id="Roving Bovine Ranch Well #1", - alternate_organization="NMBGMR", - ) - - group = add_group(context, session, [well_1, well_2]) - - elevation_method = add_data_provenance( - context, - session, - target_id=loc_1.id, - target_table="location", - field_name="elevation", - origin_source="Private geologist, consultant or univ associate", - collection_method="LiDAR DEM", - ) - - well_depth_source = add_data_provenance( - context, - session, - target_id=well_1.id, - target_table="thing", - field_name="well_depth", - origin_source="Other", - ) + add_well_casing_material(context, session, well_1) + + contact = add_contact(context, session) + + for permission in [ + "Datalogger Installation", + "Water Level Sample", + "Water Chemistry Sample", + ]: + add_permission_history( + context, + session, + contact_id=context.objects["contacts"][0].id, + permission_type=permission, + permission_allowed=True, + start_date=datetime(2025, 1, 1).date(), + end_date=None, + notes=f"Permission granted for {permission.lower()}.", + target_id=well_1.id, + target_table="thing", + ) + + for well in (well_1, well_2, well_3): + add_measuring_point_history(context, session, well=well) + for value, start, end, reason in ( + ( + "Active, pumping well", + datetime(2020, 1, 1), + datetime(2021, 1, 1), + "initial status", + ), + ( + "Destroyed, exists but not usable", + datetime(2021, 1, 1), + None, + "roving bovine", + ), + ): + add_status_history( + context, + session, + status_type="Well Status", + status_value=value, + start_date=start, + end_date=end, + reason=reason, + target_id=context.objects["wells"][0].id, + target_table="thing", + ) + +
for value, start, end in ( + ("Currently monitored", datetime(2020, 1, 1), datetime(2021, 1, 1)), + ("Not currently monitored", datetime(2021, 1, 1), None), + ): + add_status_history( + context, + session, + status_type="Monitoring Status", + status_value=value, + start_date=start, + end_date=end, + reason="Initial monitoring status", + target_id=context.objects["wells"][0].id, + target_table="thing", + ) + + for f, start, end in ( + ("Monthly", "2020-01-01", "2021-01-01"), + ("Annual", "2020-01-01", None), + ): + add_monitoring_frequency_history( + context, + session, + well=well_1, + monitoring_frequency=f, + start_date=start, + end_date=end, + ) + + for aid, aorg in ( + ("12345678", "USGS"), + ("OSE-0001", "NMOSE"), + ("Roving Bovine Ranch Well #1", "NMBGMR"), + ): + add_id_link( + context, + session, + thing=well_1, + relation="same_as", + alternate_id=aid, + alternate_organization=aorg, + ) + + add_well_casing_material(context, session, well_1) + + add_group(context, session, [well_1, well_2]) + + for kwargs in ( + { + "target_id": loc_1.id, + "target_table": "location", + "field_name": "elevation", + "origin_source": "Private geologist, consultant or univ associate", + "collection_method": "LiDAR DEM", + }, + { + "target_id": well_1.id, + "target_table": "thing", + "field_name": "well_depth", + "origin_type": "Other", + }, + { + "target_id": well_1.id, + "target_table": "thing", + "field_name": "well_completion_date", + "origin_type": "Data Portal", + }, + { + "target_id": well_1.id, + "target_table": "thing", + "field_name": "well_construction_method", + "origin_source": "Jacob's 2013 Thesis", + }, + ): + add_data_provenance(context, session, **kwargs) for purpose in ["Domestic", "Irrigation"]: add_well_purpose(context, session, well_1, purpose) + for name in ["Aquifer A", "Aquifer B"]: + system = add_aquifer_system(context, session, name, well_1) + add_thing_aquifer_association(context, session, well_1, system) + + for t in ["Artesian", "Fractured"]: + taa 
= context.objects["thing_aquifer_associations"][0] + add_aquifer_type(context, session, t, taa) + + add_geologic_formation(context, session, "000EXRV", well_1) + # parameter ID can be hardcoded because init_parameter always creates the same one parameter = session.get(Parameter, 1) block = add_block(context, session, parameter) @@ -519,8 +672,10 @@ def before_all(context): def after_all(context): with session_ctx() as session: for table in context.objects.values(): - for obj in table: - session.delete(obj) + for record in table: + obj = session.get(record.__class__, record.id) + if obj: + session.delete(obj) session.commit() diff --git a/tests/features/steps/well-additional-information.py b/tests/features/steps/well-additional-information.py new file mode 100644 index 000000000..8b00f7eb7 --- /dev/null +++ b/tests/features/steps/well-additional-information.py @@ -0,0 +1,270 @@ +from behave import then + +from services.util import retrieve_latest_polymorphic_history_table_record + + +# ------------------------------------------------------------------------------ +# Permissions / Operational OK flags +# ------------------------------------------------------------------------------ +@then( + "the response should include whether repeat measurement permission is granted for the well" +) +def step_impl(context): + permission_type = "Water Level Sample" + assert "permissions" in context.water_well_data + + permission_record = retrieve_latest_polymorphic_history_table_record( + context.objects["wells"][0], "permission_history", permission_type + ) + + water_well_data_permissions = [ + p + for p in context.water_well_data["permissions"] + if p["permission_type"] == permission_type + ][0] + assert ( + water_well_data_permissions["permission_type"] + == permission_record.permission_type + ) + assert ( + water_well_data_permissions["permission_allowed"] + == permission_record.permission_allowed + ) + assert water_well_data_permissions[ + "start_date" + ] == 
permission_record.start_date.strftime("%Y-%m-%d") + if permission_record.end_date: + assert water_well_data_permissions[ + "end_date" + ] == permission_record.end_date.strftime("%Y-%m-%d") + else: + assert water_well_data_permissions["end_date"] is None + + +@then("the response should include whether sampling permission is granted for the well") +def step_impl(context): + permission_type = "Water Chemistry Sample" + assert "permissions" in context.water_well_data + + permission_record = retrieve_latest_polymorphic_history_table_record( + context.objects["wells"][0], "permission_history", permission_type + ) + + water_well_data_permissions = [ + p + for p in context.water_well_data["permissions"] + if p["permission_type"] == permission_type + ][0] + assert ( + water_well_data_permissions["permission_type"] + == permission_record.permission_type + ) + assert ( + water_well_data_permissions["permission_allowed"] + == permission_record.permission_allowed + ) + assert water_well_data_permissions[ + "start_date" + ] == permission_record.start_date.strftime("%Y-%m-%d") + if permission_record.end_date: + assert water_well_data_permissions[ + "end_date" + ] == permission_record.end_date.strftime("%Y-%m-%d") + else: + assert water_well_data_permissions["end_date"] is None + + +@then( + "the response should include whether datalogger installation permission is granted for the well" +) +def step_impl(context): + permission_type = "Datalogger Installation" + assert "permissions" in context.water_well_data + + permission_record = retrieve_latest_polymorphic_history_table_record( + context.objects["wells"][0], "permission_history", permission_type + ) + + water_well_data_permissions = [ + p + for p in context.water_well_data["permissions"] + if p["permission_type"] == permission_type + ][0] + assert ( + water_well_data_permissions["permission_type"] + == permission_record.permission_type + ) + assert ( + water_well_data_permissions["permission_allowed"] + == 
permission_record.permission_allowed + ) + assert water_well_data_permissions[ + "start_date" + ] == permission_record.start_date.strftime("%Y-%m-%d") + if permission_record.end_date: + assert water_well_data_permissions[ + "end_date" + ] == permission_record.end_date.strftime("%Y-%m-%d") + else: + assert water_well_data_permissions["end_date"] is None + + +# ------------------------------------------------------------------------------ +# Well Construction Information +# ------------------------------------------------------------------------------ + + +@then("the response should include the completion date of the well") +def step_impl(context): + assert "well_completion_date" in context.water_well_data + assert context.water_well_data["well_completion_date"] == context.objects["wells"][ + 0 + ].well_completion_date.strftime("%Y-%m-%d") + + +@then("the response should include the source of the completion information") +def step_impl(context): + assert "well_completion_date_source" in context.water_well_data + + assert ( + context.water_well_data["well_completion_date_source"] + == context.objects["wells"][0].well_completion_date_source + ) + + +@then("the response should include the driller name") +def step_impl(context): + assert "well_driller_name" in context.water_well_data + assert ( + context.water_well_data["well_driller_name"] + == context.objects["wells"][0].well_driller_name + ) + + +@then("the response should include the construction method") +def step_impl(context): + assert "well_construction_method" in context.water_well_data + assert ( + context.water_well_data["well_construction_method"] + == context.objects["wells"][0].well_construction_method + ) + + +@then("the response should include the source of the construction information") +def step_impl(context): + assert "well_construction_method_source" in context.water_well_data + assert ( + context.water_well_data["well_construction_method_source"] + == 
context.objects["wells"][0].well_construction_method_source + ) + + +# ------------------------------------------------------------------------------ +# Additional Well Physical Properties +# ------------------------------------------------------------------------------ + + +@then("the response should include the casing diameter in inches") +def step_impl(context): + assert "well_casing_diameter" in context.water_well_data + assert "well_casing_diameter_unit" in context.water_well_data + + assert ( + context.water_well_data["well_casing_diameter"] + == context.objects["wells"][0].well_casing_diameter + ) + assert context.water_well_data["well_casing_diameter_unit"] == "in" + + +@then("the response should include the casing depth in feet below ground surface") +def step_impl(context): + assert "well_casing_depth" in context.water_well_data + assert "well_casing_depth_unit" in context.water_well_data + + assert ( + context.water_well_data["well_casing_depth"] + == context.objects["wells"][0].well_casing_depth + ) + assert context.water_well_data["well_casing_depth_unit"] == "ft" + + +@then("the response should include the casing materials") +def step_impl(context): + assert "well_casing_materials" in context.water_well_data + assert set(context.water_well_data["well_casing_materials"]) == { + m.material for m in context.objects["wells"][0].well_casing_materials + } + + +@then("the response should include the well pump type (previously well_type field)") +def step_impl(context): + assert "well_pump_type" in context.water_well_data + assert ( + context.water_well_data["well_pump_type"] + == context.objects["wells"][0].well_pump_type + ) + + +@then("the response should include the well pump depth in feet (new field)") +def step_impl(context): + assert "well_pump_depth" in context.water_well_data + assert "well_pump_depth_unit" in context.water_well_data + + assert ( + context.water_well_data["well_pump_depth"] + == context.objects["wells"][0].well_pump_depth + ) + 
assert context.water_well_data["well_pump_depth_unit"] == "ft" + + +@then( + "the response should include whether the well is open and suitable for a datalogger" +) +def step_impl(context): + assert "is_suitable_for_datalogger" in context.water_well_data + assert ( + context.water_well_data["is_suitable_for_datalogger"] + == context.objects["wells"][0].is_suitable_for_datalogger + ) + + +# ------------------------------------------------------------------------------ +# Aquifer/ Geology Information +# ------------------------------------------------------------------------------ + + +@then( + "the response should include the formation as the formation zone of well completion" +) +def step_impl(context): + assert "formation_completion_code" in context.water_well_data + assert ( + context.water_well_data["formation_completion_code"] + == context.objects["wells"][0].formation_completion_code + ) + + +@then( + "the response should include the aquifer class code to classify the aquifer into aquifer system." 
+) +def step_impl(context): + for aquifer in context.water_well_data["aquifers"]: + assert "aquifer_system" in aquifer + assert {a.get("aquifer_system") for a in context.water_well_data["aquifers"]} == { + system.name for system in context.objects["aquifer_systems"] + } + + +@then( + "the response should include the aquifer type as the type of aquifers penetrated by the well" +) +def step_impl(context): + for aquifer in context.water_well_data["aquifers"]: + assert "aquifer_types" in aquifer + + if aquifer["aquifer_system"] == "Aquifer A": + assert set(aquifer["aquifer_types"]) == { + a.aquifer_type for a in context.objects["aquifer_types"] + } + else: + assert aquifer["aquifer_types"] == [] diff --git a/tests/features/steps/well-core-information.py b/tests/features/steps/well-core-information.py index b0adc8346..1f56161f6 100644 --- a/tests/features/steps/well-core-information.py +++ b/tests/features/steps/well-core-information.py @@ -163,7 +163,7 @@ def step_impl(context): and r.target_table == "thing" and r.target_id == context.objects["wells"][0].id ] - well_depth_source = well_depth_source_records[0].origin_source + well_depth_source = well_depth_source_records[0].origin_type assert context.water_well_data["well_depth_source"] == well_depth_source diff --git a/tests/test_thing.py b/tests/test_thing.py index 378f72d02..5bd504718 100644 --- a/tests/test_thing.py +++ b/tests/test_thing.py @@ -152,6 +152,38 @@ def test_add_water_well(location, group): cleanup_post_test(Thing, data["id"]) +@pytest.mark.skip( + "This duplicates the test above. That one will need to eventually be updated" +) +def test_add_water_well_with_measuring_point(location, group): + """ + Test creating a well with measuring_point_height and measuring_point_description. + + This reproduces the bug where measuring_point fields are properties (from MeasuringPointHistory table) + and cannot be set directly on Thing objects. 
+ + Expected error (before fix): AttributeError: property 'measuring_point_height' of 'Thing' object has no setter + """ + payload = { + "location_id": location.id, + "group_id": group.id, + "release_status": "draft", + "name": "Test Well with Measuring Point", + "measuring_point_height": 2.5, + "measuring_point_description": "top of casing", + } + + response = client.post("/thing/water-well", json=payload) + assert response.status_code == 201 + data = response.json() + + assert data["name"] == payload["name"] + assert data["measuring_point_height"] == 2.5 + assert data["measuring_point_description"] == "top of casing" + + cleanup_post_test(Thing, data["id"]) + + @pytest.mark.skip("Needs to be updated per changes made from feature files") def test_add_water_well_409_bad_group_id(location): bad_group_id = 9999 diff --git a/transfers/aquifer_system_transfer.py b/transfers/aquifer_system_transfer.py new file mode 100644 index 000000000..a0ba1f02e --- /dev/null +++ b/transfers/aquifer_system_transfer.py @@ -0,0 +1,141 @@ +import time +from sqlalchemy.orm import Session +from pydantic import ValidationError + +from db import AquiferSystem +from schemas.aquifer_system import CreateAquiferSystem +from transfers.util import read_csv, replace_nans, logger + + +def transfer_aquifer_systems(session: Session, limit: int = None) -> tuple: + """ + Transfer aquifer system data from LU_AquiferClass CSV to the database. + + This creates the master list of named aquifer systems (e.g., Ogallala Aquifer). The primary_aquifer_type field is set + to "Unknown" as a placeholder and will be updated during well transfer when we know what type each well encounters. + + This should be run BEFORE well_transfer.py so that aquifer records exist for wells to reference. + + Args: + session (Session): SQLAlchemy database session + limit (int, optional): Limit the number of records to transfer (for testing). + + Returns: + tuple: (input_df, cleaned_df, errors) + """ + # 1.
Read the CSV file + input_df = read_csv("LU_AquiferClass") + + # 2. Replace NaNs with None + cleaned_df = replace_nans(input_df) + + # 3. Initialize tracking variables for logging + n = len(input_df) + step = 25 + start_time = time.time() + errors = [] + created_count = 0 + skipped_count = 0 + + logger.info(f"Starting transfer of {n} aquifer systems from LU_AquiferClass.") + + # 4. Process each row + for i, row in enumerate(cleaned_df.itertuples()): + # check if limit is reached + if limit and i >= limit: + logger.info(f"Reached limit of {limit} rows. Stopping migration.") + break + + # Log progress every 'step' rows + if i and not i % step: + logger.info( + f"Processing row {i} of {n}. Avg rows per second: {step / (time.time() - start_time):.2f}" + ) + start_time = time.time() + + # Commit progress periodically + try: + session.commit() + except Exception as e: + logger.critical(f"Error committing aquifer system {i}: {e}") + session.rollback() + continue + + # 5. Extract aquifer code and name + aquifer_code = row.CODE + aquifer_name = row.MEANING + + if not aquifer_name: + error_msg = f"Row {i} (code: {aquifer_code}) has no aquifer name (MEANING)." + logger.critical(error_msg) + errors.append({"row": i, "code": aquifer_code, "error": error_msg}) + skipped_count += 1 + continue + + # 6. Check if aquifer system already exists + existing = ( + session.query(AquiferSystem) + .filter(AquiferSystem.name == aquifer_name) + .first() + ) + + if existing: + logger.info( + f"Aquifer '{aquifer_name}' (code: {aquifer_code}) already exists. Skipping." + ) + skipped_count += 1 + continue + + # 7.
Prepare data dictionary + try: + data = CreateAquiferSystem( + name=aquifer_name, + description=None, # can be updated later + primary_aquifer_type="Unknown", # placeholder - will be updated during well transfer + ) + + # Validate data using Pydantic schema + CreateAquiferSystem.model_validate(data) + + except ValidationError as e: + errors.append( + {"code": aquifer_code, "name": aquifer_name, "error": e.errors()} + ) + logger.critical( + f"Error creating aquifer system '{aquifer_name}' (code: {aquifer_code}) (row {i}): {e.errors()}" + ) + continue + + # 8. Create database record + aquifer_system = None + try: + aquifer_data = data.model_dump() + aquifer_system = AquiferSystem(**aquifer_data) + session.add(aquifer_system) + created_count += 1 + + logger.info( + f"Created aquifer system: {aquifer_system.name} (code: {aquifer_code})" + ) + + except Exception as e: + if aquifer_system is not None: + session.expunge(aquifer_system) + errors.append({"code": aquifer_code, "name": aquifer_name, "error": str(e)}) + logger.critical( + f"Error creating aquifer system record '{aquifer_name}': {e}" + ) + continue + + # 9. Final commit + try: + session.commit() + logger.info( + f"Successfully transferred {created_count} aquifer systems, skipped {skipped_count}. " + f"Note: primary_type set to 'Unknown' and will be updated during well transfer." 
+ ) + except Exception as e: + logger.critical(f"Error in final commit: {e}") + session.rollback() + + return input_df, cleaned_df, errors diff --git a/transfers/data/owners_organization_mapper.json b/transfers/data/owners_organization_mapper.json index 5ce45a8bf..b4f29bd7b 100644 --- a/transfers/data/owners_organization_mapper.json +++ b/transfers/data/owners_organization_mapper.json @@ -89,6 +89,7 @@ "Pecos Trail Inn": "Pecos Trail Inn", "Pelican Spa": "Pelican Spa", "Pistachio Tree Ranch": "Pistachio Tree Ranch", + "Quemado Mutual Water and Sewage Works Association": "Quemado Municipal Water & SWA", "Rancho Encantado": "Rancho Encantado", "Rancho San Lucas": "Rancho San Lucas", "Rancho San Marcos": "Rancho San Marcos", diff --git a/transfers/geologic_formation_transfer.py b/transfers/geologic_formation_transfer.py new file mode 100644 index 000000000..7fcd73e4c --- /dev/null +++ b/transfers/geologic_formation_transfer.py @@ -0,0 +1,141 @@ +import time +from sqlalchemy.orm import Session +from pydantic import ValidationError + +from db import GeologicFormation +from schemas.geologic_formation import CreateGeologicFormation +from transfers.util import read_csv, replace_nans, logger + + +def transfer_geologic_formations(session: Session, limit: int = None) -> tuple: + """ + Transfer geologic formation data from LU_Formations CSV to the database. + + This should be run BEFORE well_transfer.py so that geologic formation records exist for wells to reference. + + Args: + session (Session): SQLAlchemy database session + limit (int, optional): Optional limit on number of records to transfer (for testing). + + Returns: + tuple: (input_df, cleaned_df, errors) + """ + # 1. Read the CSV file + input_df = read_csv("LU_Formations") + + # 2. Replace NaNs with None + cleaned_df = replace_nans(input_df) + + # 3.
Initialize tracking variables for logging + n = len(cleaned_df) + step = 25 + start_time = time.time() + errors = [] + created_count = 0 + skipped_count = 0 + + logger.info(f"Starting transfer of {n} geologic formations") + + # 4. Process each row + for i, row in enumerate(cleaned_df.itertuples()): + # check if limit is reached + if limit and i >= limit: + logger.info(f"Reached limit of {limit} rows. Stopping migration.") + break + + # Log progress every 'step' rows + if i and not i % step: + logger.info( + f"Processing row {i} of {n}. Avg rows per second: {step / (time.time() - start_time):.2f}" + ) + start_time = time.time() + + # Commit progress periodically + try: + session.commit() + except Exception as e: + logger.critical(f"Error committing geologic formations: {e}") + session.rollback() + continue + + # 5. Extract formation code and description + formation_code = row.Code + + if not formation_code: + logger.warning(f"Skipping row {i}: Missing formation code") + skipped_count += 1 + continue + + # Check if this formation already exists + existing = ( + session.query(GeologicFormation) + .filter(GeologicFormation.formation_code == formation_code) + .first() + ) + + if existing: + logger.info( + f"Skipping row {i}: Formation code {formation_code} already exists" + ) + skipped_count += 1 + continue + + # 6. Prepare data for creation + # Note: We only store the formation_code. Formation names will be mapped by the API using a + # formations.json file from authoritative sources (e.g., USGS). + # The description field is left as None and can be populated later if needed. 
+ # Note: lithology is set to None here and will be updated during stratigraphy transfer + try: + data = CreateGeologicFormation( + formation_code=formation_code, + description=None, # Not storing from legacy data + lithology=None, # Will be populated from Stratigraphy.csv + ) + + # Validate the data using Pydantic schema + CreateGeologicFormation.model_validate(data) + + except ValidationError as e: + errors.append({"code": formation_code, "errors": e.errors()}) + logger.critical( + f"Validation error for row {i} with Code {formation_code}: {e.errors()}" + ) + continue + except Exception as e: + errors.append({"code": formation_code, "errors": str(e)}) + logger.critical(f"Error preparing data for {formation_code}: {e}") + continue + + # 7. Create database object + geologic_formation = None + try: + formation_data = data.model_dump() + geologic_formation = GeologicFormation(**formation_data) + session.add(geologic_formation) + created_count += 1 + + logger.info( + f"Created geologic formation: {geologic_formation.formation_code}" + ) + + except Exception as e: + if geologic_formation is not None: + session.expunge(geologic_formation) + errors.append({"code": formation_code, "error": str(e)}) + logger.critical( + f"Error creating geologic formation for {formation_code}: {e}" + ) + continue + + # 8. Final commit + try: + session.commit() + logger.info( + f"Successfully transferred {created_count} geologic formations, skipped {skipped_count}. " + f"Note: lithology is None and will be updated during stratigraphy transfer." 
+ ) + except Exception as e: + logger.critical(f"Error during final commit of geologic formations: {e}") + session.rollback() + + return input_df, cleaned_df, errors diff --git a/transfers/permissions_transfer.py b/transfers/permissions_transfer.py new file mode 100644 index 000000000..18daa1040 --- /dev/null +++ b/transfers/permissions_transfer.py @@ -0,0 +1,95 @@ +from sqlalchemy.orm import Session +from datetime import datetime +from pandas import isna + +from db import Thing, PermissionHistory +from transfers.util import read_csv, logger, replace_nans + +""" +Developer's notes + +According to Laila the column WellData.OpenWellLoggerOK only pertains to the +physical properties of a well (that is, if a datalogger can be installed). It +does not pertain to permissions. +""" + + +def transfer_permissions(session: Session): + """ + The transferred wells and contacts need to be transferred first + - to access the auto-generated well IDs + - to know who gave permission to which well since contact_id is required for + PermissionHistory + """ + wdf = read_csv("WellData", dtype={"OSEWelltagID": str}) + wdf = replace_nans(wdf) + + transferred_wells = ( + session.query(Thing).filter(Thing.thing_type == "water well").all() + ) + + for well in transferred_wells: + if len(well.contacts) == 0: + logger.critical( + f"Well {well.name} has no associated contacts; skipping permission transfer." 
+ ) + continue + else: + # Assuming the first contact is the relevant one + contact_id = well.contacts[0].id + + allow_water_level_samples = wdf.loc[ + wdf["PointID"] == well.name, "MonitorOK" + ].values + if len(allow_water_level_samples) == 0: + pass + elif isna(allow_water_level_samples[0]): + pass + else: + try: + permission_allowed = bool(allow_water_level_samples[0]) + permission = PermissionHistory( + contact_id=contact_id, + permission_type="Water Level Sample", + permission_allowed=permission_allowed, + start_date=datetime.today().date(), + target_id=well.id, + target_table="thing", + ) + session.add(permission) + logger.info( + f"Transferred Water Level Sample permission for well {well.name}: {permission_allowed}." + ) + except Exception as e: + logger.error(f"Error transferring permission for well {well.name}: {e}") + session.rollback() + pass + + allow_water_chemistry_samples = wdf.loc[ + wdf["PointID"] == well.name, "SampleOK" + ].values + if len(allow_water_chemistry_samples) == 0: + pass + elif isna(allow_water_chemistry_samples[0]): + pass + else: + try: + permission_allowed = bool(allow_water_chemistry_samples[0]) + permission = PermissionHistory( + contact_id=contact_id, + permission_type="Water Chemistry Sample", + permission_allowed=permission_allowed, + start_date=datetime.today().date(), + target_id=well.id, + target_table="thing", + ) + session.add(permission) + logger.info( + f"Transferred Water Chemistry Sample permission for well {well.name}: {permission_allowed}." + ) + except Exception as e: + logger.error(f"Error transferring permission for well {well.name}: {e}") + session.rollback() + pass + + session.commit() diff --git a/transfers/stratigraphy_transfer.py b/transfers/stratigraphy_transfer.py new file mode 100644 index 000000000..de51e354e --- /dev/null +++ b/transfers/stratigraphy_transfer.py @@ -0,0 +1,285 @@ +""" +Transfer script for stratigraphy (lithology log) data. 
+ +This creates ThingGeologicFormationAssociation records from the Stratigraphy CSV, which contains depth-specific +formation information for wells. It also updates the GeologicFormation.lithology field based on the +Stratigraphy.Lithology data. +""" + +import time +from sqlalchemy.orm import Session + +from db import Thing, GeologicFormation, ThingGeologicFormationAssociation +from transfers.util import ( + read_csv, + replace_nans, + filter_to_valid_point_ids, + lexicon_mapper, + logger, +) + + +def transfer_stratigraphy(session: Session, limit: int = None) -> tuple: + """ + Transfer detailed stratigraphy (lithology log) data from Stratigraphy CSV. + + The Stratigraphy CSV contains multiple rows per well, each representing a + depth interval, the formation encountered, and its lithology. + + Fields used: + - PointID: Links to the well + - UnitIdentifier: Formation code (maps to LU_Formations) + - StratTop: Top depth of the layer (feet below ground surface) + - StratBottom: Bottom depth of the layer (feet below ground surface) + - Lithology: Lithology code (maps to LU_Lithology via ABBREVIATION field) + + This should be run AFTER: + 1. geologic_formation_transfer.py (so formations exist) + 2. well_transfer.py (so wells exist) + + Args: + session: Database session + limit: Optional limit on number of WELLS to process (for testing) + + Returns: + tuple: (input_df, cleaned_df, errors) + """ + # 1. Read and clean data + input_df = read_csv("Stratigraphy") + cleaned_df = replace_nans(input_df) + + # Step 2: Filter to only wells that exist in database + cleaned_df = filter_to_valid_point_ids(session, cleaned_df) + + n_records = len(cleaned_df) + n_wells = len(cleaned_df["PointID"].unique()) + + logger.info( + f"Starting transfer of {n_records} stratigraphy records for {n_wells} wells" + ) + + # 3.
Initialize tracking variables for logging + step = 25 + start_time = time.time() + errors = [] + created_count = 0 + skipped_count = 0 + lithology_updates = 0 + + # Step 4: Group by well for efficient processing + well_groups = cleaned_df.groupby("PointID") + + for well_index, (pointid, strat_group) in enumerate(well_groups): + # Check limit (on number of wells, not records) + if limit and well_index >= limit: + logger.info(f"Reached limit of {limit} wells. Stopping.") + break + + # Progress logging every 25 wells + if well_index and not well_index % step: + logger.info( + f"Processing well {well_index} of {n_wells}, " + f"avg wells per second: {step / (time.time() - start_time):.2f}" + ) + start_time = time.time() + + # Periodic commit + try: + session.commit() + except Exception as e: + logger.critical(f"Error committing stratigraphy records: {e}") + session.rollback() + continue + + # 5. Get the well from database + thing = session.query(Thing).filter(Thing.name == pointid).first() + if not thing: + logger.warning( + f"Well {pointid} not found in database, skipping stratigraphy" + ) + skipped_count += len(strat_group) + continue + + logger.info( + f"Processing {len(strat_group)} stratigraphy layers for well {pointid}" + ) + + # 6. 
Process each stratigraphy record for this well + for layer_index, row in enumerate(strat_group.itertuples()): + # Validate required fields + # UnitIdentifier + if not hasattr(row, "UnitIdentifier") or not row.UnitIdentifier: + logger.critical( + f"Stratigraphy record {layer_index} for {pointid} has no UnitIdentifier, skipping" + ) + skipped_count += 1 + errors.append( + { + "pointid": pointid, + "layer": layer_index, + "error": "Missing UnitIdentifier", + } + ) + continue + # StratTop + if not hasattr(row, "StratTop") or row.StratTop is None: + logger.critical( + f"Stratigraphy record {layer_index} for {pointid} has no StratTop, skipping" + ) + skipped_count += 1 + errors.append( + { + "pointid": pointid, + "layer": layer_index, + "error": "Missing StratTop", + } + ) + continue + # StratBottom + if not hasattr(row, "StratBottom") or row.StratBottom is None: + logger.critical( + f"Stratigraphy record {layer_index} for {pointid} has no StratBottom, skipping" + ) + skipped_count += 1 + errors.append( + { + "pointid": pointid, + "layer": layer_index, + "error": "Missing StratBottom", + } + ) + continue + + # Extract formation code + formation_code = row.UnitIdentifier.strip() + + # Validate depth values + try: + top_depth = float(row.StratTop) + bottom_depth = float(row.StratBottom) + except (ValueError, TypeError) as e: + error_msg = f"Invalid depth values: StratTop={row.StratTop}, StratBottom={row.StratBottom}" + logger.critical( + f"{pointid} layer {layer_index}: {error_msg}, error: {e}" + ) + errors.append( + { + "pointid": pointid, + "layer": layer_index, + "error": error_msg, + "details": str(e), # for conversion errors + } + ) + skipped_count += 1 + continue + + # Validate depth logic + if top_depth >= bottom_depth: + error_msg = ( + f"Invalid depth logic: top={top_depth} >= bottom={bottom_depth}" + ) + logger.critical(f"{pointid} layer {layer_index}: {error_msg}") + errors.append( + {"pointid": pointid, "layer": layer_index, "error": error_msg} + ) + 
skipped_count += 1 + continue + + if top_depth < 0: + error_msg = f"Negative top depth: {top_depth}" + logger.critical(f"{pointid} layer {layer_index}: {error_msg}") + errors.append( + {"pointid": pointid, "layer": layer_index, "error": error_msg} + ) + skipped_count += 1 + continue + + # 7. Get or create the formation + formation = ( + session.query(GeologicFormation) + .filter(GeologicFormation.formation_code == formation_code) + .first() + ) + + if not formation: + # Create new formation if it doesn't exist + logger.info(f"Creating new geologic formation: {formation_code}") + formation = GeologicFormation( + formation_code=formation_code, + description=None, + lithology=None, # Will be set below + ) + session.add(formation) + session.flush() + + # 8. Update formation lithology if available and not already set + if hasattr(row, "Lithology") and row.Lithology: + try: + # Map lithology code to geologic_formation.lithology using ABBREVIATION field + lithology = lexicon_mapper.map_value( + f"LU_Lithology:{row.Lithology}" + ) + + # Update if formation does not have lithology yet + if not formation.lithology: + formation.lithology = lithology + lithology_updates += 1 + logger.info(f"Set lithology for {formation_code}: {lithology}") + elif formation.lithology != lithology: + # Log if there's a mismatch (different lithology for same formation) + logger.warning( + f"Formation {formation_code} has conflicting lithology: " + f"existing='{formation.lithology}', new='{lithology}'." + ) + except KeyError: + logger.warning( + f"Unknown lithology code '{row.Lithology}' for {pointid}, skipping lithology update" + ) + except Exception as e: + logger.warning(f"Error mapping lithology '{row.Lithology}': {e}") + + # 9. 
Create ThingGeologicFormationAssociation record + try: + formation_assoc = ThingGeologicFormationAssociation( + thing=thing, + geologic_formation=formation, + top_depth=top_depth, + bottom_depth=bottom_depth, + ) + session.add(formation_assoc) + created_count += 1 + + logger.info( + f" Layer {layer_index + 1}: {formation.formation_code} " + f"from {top_depth:.1f} to {bottom_depth:.1f} ft" + ) + + except Exception as e: + logger.critical( + f"Error creating stratigraphy association for {pointid}, " + f"formation {formation_code}: {e}" + ) + errors.append( + { + "pointid": pointid, + "formation": formation_code, + "layer": layer_index, + "error": str(e), + } + ) + skipped_count += 1 + continue + + # 10. Final commit + try: + session.commit() + logger.info( + f"Successfully transferred stratigraphy: " + f"{created_count} associations created, {skipped_count} skipped, " + f"{lithology_updates} lithology fields updated, {len(errors)} errors" + ) + except Exception as e: + logger.critical(f"Error in final commit: {e}") + session.rollback() + + return input_df, cleaned_df, errors diff --git a/transfers/util.py b/transfers/util.py index d459ee4ff..24389bc6d 100644 --- a/transfers/util.py +++ b/transfers/util.py @@ -326,6 +326,9 @@ def get_transferable_wells( # get all the pointids from the well photos and include them wellphotos_df = read_csv("WellPhotos") wellphotos_pointids = wellphotos_df["PointID"].unique().tolist() + + # get all pointids that have owner info + pointids = list(set(usgs_pointids + collabnet_pointids + wellphotos_pointids)) return df[df["DataSource"].isin(valid_datasources) | df["PointID"].isin(pointids)] @@ -470,7 +473,7 @@ def make_location_data_provenance( ) -> list[DataProvenance]: provenance_records = [] - if row.AltitudeAccuracy or row.CoordinateAccuracy: + if row.AltitudeAccuracy: provenance = DataProvenance( target_id=location.id, target_table="location", @@ -564,7 +567,6 @@ def make_location_data_provenance( target_id=location.id, 
target_table="location", field_name="point", - origin_source=None, collection_method=coordinate_method, accuracy_value=accuracy_value, accuracy_unit=accuracy_unit, @@ -617,6 +619,8 @@ def _make_lu_to_lexicon_mapper(self) -> dict[str, str]: # Lookup tables where CODE maps to MEANING lu_tables = [ "LU_AltitudeMethod", + "LU_AquiferClass", + "LU_AquiferType", "LU_CollectionMethod", "LU_ConstructionMethod", "LU_CoordinateAccuracy", @@ -626,7 +630,9 @@ def _make_lu_to_lexicon_mapper(self) -> dict[str, str]: "LU_DataSource", "LU_Depth_CompletionSource", "LU_Discharge_ChemistrySource", + "LU_Formations", "LU_LevelStatus", + "LU_Lithology", "LU_MajorAnalyte", "LU_MeasurementMethod", "LU_MinorTraceAnalyte", @@ -645,6 +651,9 @@ def _make_lu_to_lexicon_mapper(self) -> dict[str, str]: if lu_table == "LU_Formations": code = row.Code meaning = row.Meaning + elif lu_table == "LU_Lithology": + code = row.ABBREVIATION + meaning = row.TERM else: code = row.CODE meaning = row.MEANING diff --git a/transfers/well_transfer.py b/transfers/well_transfer.py index fa912ed18..eed5c3eb8 100644 --- a/transfers/well_transfer.py +++ b/transfers/well_transfer.py @@ -14,8 +14,11 @@ # limitations under the License. 
# =============================================================================== import json + +# import time from datetime import datetime, UTC +# import re import pandas as pd from pandas import isna from pydantic import ValidationError @@ -35,6 +38,11 @@ StatusHistory, MonitoringFrequencyHistory, MeasuringPointHistory, + # DataProvenance, + # AquiferSystem, + # AquiferType, + # GeologicFormation, + # ThingAquiferAssociation, ) from schemas.thing import CreateWell, CreateWellScreen from services.gcs_helper import get_storage_bucket From b68900ed3965d9e57cff47f63a16f2a85890cf09 Mon Sep 17 00:00:00 2001 From: jakeross Date: Tue, 2 Dec 2025 20:57:44 -0700 Subject: [PATCH 31/66] refactor: enhance transfer process by adding aquifer system and geologic formation transfers, improving logging and error handling --- core/lexicon.json | 371 +++++++++++++++++++++++++++++++++ services/util.py | 8 +- transfers/transfer.py | 7 + transfers/transferer.py | 1 + transfers/util.py | 15 +- transfers/well_transfer.py | 416 +++++++++++++++++++++++++++++++------ 6 files changed, 751 insertions(+), 67 deletions(-) diff --git a/core/lexicon.json b/core/lexicon.json index 815a40d2f..142f1745c 100644 --- a/core/lexicon.json +++ b/core/lexicon.json @@ -714,6 +714,377 @@ {"categories": ["geographic_scale"], "term": "Regional", "definition": "Important aquifers serving regions"}, {"categories": ["geographic_scale"], "term": "Local", "definition": "Smaller, locally important aquifers"}, {"categories": ["geographic_scale"], "term": "Minor", "definition": "Limited extent or yield"}, + {"categories": ["formation_code"],"term": "000EXRV","definition": "Extrusive Rocks"}, + {"categories": ["formation_code"],"term": "000IRSV","definition": "Intrusive Rocks"}, + {"categories": ["formation_code"],"term": "050QUAL","definition": "Quaternary Alluvium in Valleys"}, + {"categories": ["formation_code"],"term": "100QBAS","definition": "Quaternary basalt"}, + {"categories": ["formation_code"],"term": 
"110ALVM","definition": "Quaternary Alluvium"}, + {"categories": ["formation_code"],"term": "110AVMB","definition": "Alluvium, Bolson Deposits and Other Surface Deposits"}, + {"categories": ["formation_code"],"term": "110BLSN","definition": "Bolson Fill"}, + {"categories": ["formation_code"],"term": "110NTGU","definition": "Naha and Tsegi Alluvium Deposits, undifferentiated"}, + {"categories": ["formation_code"],"term": "110PTODC","definition": "Pediment, Terrace and Other Deposits of Gravel, Sand and Caliche"}, + {"categories": ["formation_code"],"term": "111MCCR","definition": "McCathys Basalt Flow"}, + {"categories": ["formation_code"],"term": "112ANCH","definition": "Upper Santa Fe Group, Ancha Formation (QTa)"}, + {"categories": ["formation_code"],"term": "112CURB","definition": "Cuerbio Basalt"}, + {"categories": ["formation_code"],"term": "112LAMA","definition": "Lama Formation (QTl, QTbh) and other mountain front alluvial fans"}, + {"categories": ["formation_code"],"term": "112LAMAb","definition": "Lama Fm (QTl, QTbh) between Servilleta Basalts"}, + {"categories": ["formation_code"],"term": "112LGUN","definition": "Laguna Basalt Flow"}, + {"categories": ["formation_code"],"term": "112QTBF","definition": "Quaternary-Tertiary basin fill (not in valleys)"}, + {"categories": ["formation_code"],"term": "112QTBFlac","definition": "Quaternary-Tertiary basin fill, lacustrian-playa lithofacies"}, + {"categories": ["formation_code"],"term": "112QTBFpd","definition": "Quaternary-Tertiary basin fill, distal piedmont lithofacies"}, + {"categories": ["formation_code"],"term": "112QTBFppm","definition": "Quaternary-Tertiary basin fill, proximal and medial piedmont lithofacies"}, + {"categories": ["formation_code"],"term": "112SNTF","definition": "Santa Fe Group, undivided"}, + {"categories": ["formation_code"],"term": "112SNTFA","definition": "Upper Santa Fe Group, axial facies"}, + {"categories": ["formation_code"],"term": "112SNTFOB","definition": "Upper SantaFe Group, 
Loma Barbon member of Arroyo Ojito Formatin"}, + {"categories": ["formation_code"],"term": "112SNTFP","definition": "Upper Santa Fe Group, piedmont facies"}, + {"categories": ["formation_code"],"term": "112TRTO","definition": "Tuerto Gravels (QTt)"}, + {"categories": ["formation_code"],"term": "120DTIL","definition": "Datil Formation"}, + {"categories": ["formation_code"],"term": "120ELRT","definition": "El Rito Formation"}, + {"categories": ["formation_code"],"term": "120IRSV","definition": "Tertiary Intrusives"}, + {"categories": ["formation_code"],"term": "120SBLC","definition": "Sierra Blanca Volcanics, undivided"}, + {"categories": ["formation_code"],"term": "120SRVB","definition": "Tertiary Servilletta Basalts (Tsb)"}, + {"categories": ["formation_code"],"term": "120SRVBf","definition": "Tertiary Servilletta Basalts, fractured (Tsbf)"}, + {"categories": ["formation_code"],"term": "120TSBV_Lower","definition": "Tertiary Sierra Blanca area lower volcanic unit (Hog Pen Fm)"}, + {"categories": ["formation_code"],"term": "120TSBV_Upper","definition": "Tertiary Sierra Blanca area upper volcanic unit (above Hog Pen Fm)"}, + {"categories": ["formation_code"],"term": "121CHMT","definition": "Chamita Formation (Tc)"}, + {"categories": ["formation_code"],"term": "121CHMTv","definition": "Chamita Fm, Vallito member (Tcv)"}, + {"categories": ["formation_code"],"term": "121CHMTvs","definition": "Chamita Fm, sandy Vallito member (Tcvs)"}, + {"categories": ["formation_code"],"term": "121OGLL","definition": "Ogallala Formation"}, + {"categories": ["formation_code"],"term": "121PUYEF","definition": "Puye Conglomerate, Fanglomerate Member"}, + {"categories": ["formation_code"],"term": "121TSUQ","definition": "Tesuque Formation, undifferentiated unit"}, + {"categories": ["formation_code"],"term": "121TSUQa","definition": "Tesuque Fm lithosome A (Tta)"}, + {"categories": ["formation_code"],"term": "121TSUQacu","definition": "Tesuque Fm (upper), Cuarteles member lithosome A 
(Ttacu)"}, + {"categories": ["formation_code"],"term": "121TSUQacuf","definition": "Tesuque Fm (upper), fine-grained Cuarteles member lithosome A (Ttacuf)"}, + {"categories": ["formation_code"],"term": "121TSUQaml","definition": "Tesuque Fm lower-middle lithosome A (Ttaml)"}, + {"categories": ["formation_code"],"term": "121TSUQb","definition": "Tesuque Fm lithosome B (Ttb)"}, + {"categories": ["formation_code"],"term": "121TSUQbfl","definition": "Tesuque Fm lower lithosome B, basin-floor deposits (Ttbfl)"}, + {"categories": ["formation_code"],"term": "121TSUQbfm","definition": "Tesuque Fm middle lithosome B, basin-floor deposits (Ttbfm)"}, + {"categories": ["formation_code"],"term": "121TSUQbp","definition": "Tesuque Fm lithosome B, Pojoaque member (Ttbp)"}, + {"categories": ["formation_code"],"term": "121TSUQce","definition": "Tesuque Fm, Cejita member (Ttce)"}, + {"categories": ["formation_code"],"term": "121TSUQe","definition": "Tesuque Fm lithosome E (Tte)"}, + {"categories": ["formation_code"],"term": "121TSUQs","definition": "Tesuque Fm lithosome S (Tts)"}, + {"categories": ["formation_code"],"term": "121TSUQsa","definition": "Tesuque Fm lateral gradation lithosomes S and A (Ttsag)"}, + {"categories": ["formation_code"],"term": "121TSUQsc","definition": "Tesuque Fm coarse-grained lithosome S (Ttsc)"}, + {"categories": ["formation_code"],"term": "121TSUQsf","definition": "Tesuque Fm, fine-grained lithosome S (Ttsf)"}, + {"categories": ["formation_code"],"term": "122CHOC","definition": "Chamita and Ojo Caliente interlayered (Ttoc)"}, + {"categories": ["formation_code"],"term": "122CRTO","definition": "Chama El Rito Formation (Tesuque member, Ttc)"}, + {"categories": ["formation_code"],"term": "122OJOC","definition": "Ojo Caliente Formation (Tesuque member, Tto)"}, + {"categories": ["formation_code"],"term": "122PICR","definition": "Picuris Tuff"}, + {"categories": ["formation_code"],"term": "122PPTS","definition": "Popotosa Formation"}, + {"categories": 
["formation_code"],"term": "122SNTFP","definition": "Lower Santa Fe Group, piedmont facies"}, + {"categories": ["formation_code"],"term": "123DTILSPRS","definition": "Datil Group ignimbrites and lavas and Spears Group, interbedded"}, + {"categories": ["formation_code"],"term": "123DTMGandbas","definition": "Datil and Mogollon Group andesite, basaltic andesite, and basalt flows"}, + {"categories": ["formation_code"],"term": "123DTMGign","definition": "Datil and Mogollon Group ignimbrites"}, + {"categories": ["formation_code"],"term": "123DTMGrhydac","definition": "Datil and Mogollon Group rhyolite and dacite flows"}, + {"categories": ["formation_code"],"term": "123ESPN","definition": "T Espinaso Formation (Te)"}, + {"categories": ["formation_code"],"term": "123GLST","definition": "T Galisteo Formation"}, + {"categories": ["formation_code"],"term": "123PICS","definition": "T Picuris Formation (Tp)"}, + {"categories": ["formation_code"],"term": "123PICSc","definition": "T Picuris Formation, basal conglomerate (Tpc)"}, + {"categories": ["formation_code"],"term": "123PICSl","definition": "T lower Picuris Formation (Tpl)"}, + {"categories": ["formation_code"],"term": "123SPRSDTMGlava","definition": "Spears Group and Datil-Mogollon intermediate-mafic lavas, interbedded"}, + {"categories": ["formation_code"],"term": "123SPRSlower","definition": "Spears Group, lower part; tuffaceous, gravelly debris and mud flows"}, + {"categories": ["formation_code"],"term": "123SPRSmid_uppe","definition": "Spears Group, middle-upper part; excludes Dog Spring Formation"}, + {"categories": ["formation_code"],"term": "124BACA","definition": "Baca Formation"}, + {"categories": ["formation_code"],"term": "124CBMN","definition": "Cub Mountain Formation"}, + {"categories": ["formation_code"],"term": "124LLVS","definition": "Llaves Member of San Jose Formation"}, + {"categories": ["formation_code"],"term": "124PSCN","definition": "Poison Canyon Formation"}, + {"categories": 
["formation_code"],"term": "124RGIN","definition": "Regina Member of San Jose Formation"}, + {"categories": ["formation_code"],"term": "124SNJS","definition": "San Jose Formation"}, + {"categories": ["formation_code"],"term": "124TPCS","definition": "TapicitosMember of San Jose Formation"}, + {"categories": ["formation_code"],"term": "125NCMN","definition": "Nacimiento Formation"}, + {"categories": ["formation_code"],"term": "125NCMNS","definition": "Nacimiento Formation, Sandy Shale Facies"}, + {"categories": ["formation_code"],"term": "125RTON","definition": "Raton Formation"}, + {"categories": ["formation_code"],"term": "130CALDFLOOR","definition": "Caldera Floor bedrock S. of San Agustin Plains. Mostly DTILSPRS & Paleo."}, + {"categories": ["formation_code"],"term": "180TKSCC_Upper","definition": "Tertiary-Cretaceous, Sanders Canyon, Cub Mtn. and upper Crevasse Canyon Fm"}, + {"categories": ["formation_code"],"term": "180TKTR","definition": "Tertiary-Cretaceous-Triassic, Baca, Crevasse Cyn, Gallup, Mancos, Dakota, T"}, + {"categories": ["formation_code"],"term": "210CRCS","definition": "Cretaceous System, undivided"}, + {"categories": ["formation_code"],"term": "210GLUPC_Lower","definition": "K Gallup Sandstone and lower Crevasse Canyon Fm"}, + {"categories": ["formation_code"],"term": "210HOSTD","definition": "K Hosta Dalton"}, + {"categories": ["formation_code"],"term": "210MCDK","definition": "K Mancos/Dakota undivided"}, + {"categories": ["formation_code"],"term": "210MNCS","definition": "Mancos Shale, undivided"}, + {"categories": ["formation_code"],"term": "210MNCSL","definition": "K Lower Mancos"}, + {"categories": ["formation_code"],"term": "210MNCSU","definition": "K Upper Mancos"}, + {"categories": ["formation_code"],"term": "211CLFHV","definition": "Cliff House Sandstone, includes La Ventana Tongues in NW Sandoval Co."}, + {"categories": ["formation_code"],"term": "211CRLL","definition": "Carlile Shale"}, + {"categories": ["formation_code"],"term": 
"211CRVC","definition": "Crevasse Canyon Formation of Mesaverde Group"}, + {"categories": ["formation_code"],"term": "211DKOT","definition": "Dakota Sandstone or Formation"}, + {"categories": ["formation_code"],"term": "211DLCO","definition": "Dilco Coal Member of Crevasse Canyon Formation of Mesaverde Group"}, + {"categories": ["formation_code"],"term": "211DLTN","definition": "Dalton Sandstone Member of Crevasse Canyon Formation of Mesaverde Group"}, + {"categories": ["formation_code"],"term": "211FRHS","definition": "Fort Hays Limestone Member of Niobrara Formation"}, + {"categories": ["formation_code"],"term": "211FRLD","definition": "Fruitland Formation"}, + {"categories": ["formation_code"],"term": "211FRMG","definition": "Farmington Sandstone Member of Kirtland Shale"}, + {"categories": ["formation_code"],"term": "211GBSNC","definition": "Gibson Coal Member of Crevasse Canyon Formation of Mesaverde Group"}, + {"categories": ["formation_code"],"term": "211GLLG","definition": "Gallego Sandstone Member of Gallup Sandstone"}, + {"categories": ["formation_code"],"term": "211GLLP","definition": "Gallup Sandstone"}, + {"categories": ["formation_code"],"term": "211GRRG","definition": "Greenhorn and Graneros Formations"}, + {"categories": ["formation_code"],"term": "211GRRS","definition": "Graneros Shale"}, + {"categories": ["formation_code"],"term": "211HOST","definition": "Hosta Tongue of Point Lookout Sandstone of Mesaverde Group"}, + {"categories": ["formation_code"],"term": "211KRLD","definition": "Kirtland Shale"}, + {"categories": ["formation_code"],"term": "211LWIS","definition": "Lewis Shale"}, + {"categories": ["formation_code"],"term": "211MENF","definition": "Menefee Formation"}, + {"categories": ["formation_code"],"term": "211MENFU","definition": "K Upper Menefee (above Harmon Sandstone)"}, + {"categories": ["formation_code"],"term": "211MVRD","definition": "Mesaverde Group"}, + {"categories": ["formation_code"],"term": "211OJAM","definition": "Ojo Alamo 
Sandstone"}, + {"categories": ["formation_code"],"term": "211PCCF","definition": "Pictured Cliffs Sandstone"}, + {"categories": ["formation_code"],"term": "211PIRR","definition": "Pierre Shale"}, + {"categories": ["formation_code"],"term": "211PNLK","definition": "Point Lookout Sandstone"}, + {"categories": ["formation_code"],"term": "211SMKH","definition": "Smoky Hill Marl Member"}, + {"categories": ["formation_code"],"term": "211TLLS","definition": "Twowells Sandstone Lentil of Pike of Dakota Sandstone"}, + {"categories": ["formation_code"],"term": "212KTRP","definition": "K Dakota Sandstone, Moenkopi Fm, Artesia Group"}, + {"categories": ["formation_code"],"term": "217PRGR","definition": "Purgatoire Formation"}, + {"categories": ["formation_code"],"term": "220ENRD","definition": "Entrada Sandstone"}, + {"categories": ["formation_code"],"term": "220JURC","definition": "Jurassic undivided"}, + {"categories": ["formation_code"],"term": "220NAVJ","definition": "Navajo Sandstone"}, + {"categories": ["formation_code"],"term": "221BLFF","definition": "Bluff Sandstone of Morrison Formation"}, + {"categories": ["formation_code"],"term": "221CSPG","definition": "Cow Springs Sandstone of Morrison Formation"}, + {"categories": ["formation_code"],"term": "221ERADU","definition": "Entrada Sandstone of San Rafael Group, Upper"}, + {"categories": ["formation_code"],"term": "221MRSN","definition": "Morrison Formation"}, + {"categories": ["formation_code"],"term": "221MRSN/BBSN","definition": "Brushy Basin Member of Morrison"}, + {"categories": ["formation_code"],"term": "221MRSN/JCKP","definition": "Jackpile Sandstone Member of Morrison"}, + {"categories": ["formation_code"],"term": "221MRSN/RCAP","definition": "Recapture Shale Member of Morrison"}, + {"categories": ["formation_code"],"term": "221MRSN/WWCN","definition": "Westwater Canyon Member of Morrison"}, + {"categories": ["formation_code"],"term": "221SLWS","definition": "Salt Wash Sandstone Member of Morrison Formation"}, 
+ {"categories": ["formation_code"],"term": "221SMVL","definition": "Summerville Formation of San Rafael Group"}, + {"categories": ["formation_code"],"term": "221TDLT","definition": "J Todilto"}, + {"categories": ["formation_code"],"term": "221WSRC","definition": "Westwater Canyon Sandstone Member of Morrison Formation"}, + {"categories": ["formation_code"],"term": "221ZUNIS","definition": "Zuni Sandstone"}, + {"categories": ["formation_code"],"term": "231AGZC","definition": "Tr Agua Zarca"}, + {"categories": ["formation_code"],"term": "231AGZCU","definition": "Tr Upper Agua Zarca"}, + {"categories": ["formation_code"],"term": "231CHNL","definition": "Chinle Formation"}, + {"categories": ["formation_code"],"term": "231CORR","definition": "Correo Sandstone Member of Chinle Formation"}, + {"categories": ["formation_code"],"term": "231DCKM","definition": "Dockum Group"}, + {"categories": ["formation_code"],"term": "231PFDF","definition": "Tr Petrified Forest"}, + {"categories": ["formation_code"],"term": "231PFDFL","definition": "Tr Lower Petrified Forest (below middle sandstone)"}, + {"categories": ["formation_code"],"term": "231PFDFM","definition": "Tr Middle Petrified Forest sandstone"}, + {"categories": ["formation_code"],"term": "231PFDFU","definition": "Tr Upper Petrified Forest (above middle sandstone)"}, + {"categories": ["formation_code"],"term": "231RCKP","definition": "Rock Point Member of Wingate Sandstone"}, + {"categories": ["formation_code"],"term": "231SNRS","definition": "Santa Rosa Sandstone"}, + {"categories": ["formation_code"],"term": "231SNSL","definition": "Sonsela Sandstone Bed of Petrified Forest Member of Chinle Formation"}, + {"categories": ["formation_code"],"term": "231SRMP","definition": "Shinarump Member of Chinle Formation"}, + {"categories": ["formation_code"],"term": "231WNGT","definition": "Wingate Sandstone"}, + {"categories": ["formation_code"],"term": "260SNAN","definition": "P San Andres"}, + {"categories": 
["formation_code"],"term": "260SNAN_lower","definition": "Lower San Andres Formation"}, + {"categories": ["formation_code"],"term": "261SNGL","definition": "P San Andres - Glorieta Sandstone in Rio Bonito member"}, + {"categories": ["formation_code"],"term": "300YESO","definition": "P Yeso"}, + {"categories": ["formation_code"],"term": "300YESO_lower","definition": "Lower Yeso Formation"}, + {"categories": ["formation_code"],"term": "300YESO_upper","definition": "Upper Yeso Formation"}, + {"categories": ["formation_code"],"term": "310ABO","definition": "P Abo"}, + {"categories": ["formation_code"],"term": "310DCLL","definition": "De Chelly Sandstone Member of Cutler Formation"}, + {"categories": ["formation_code"],"term": "310GLOR","definition": "Glorieta Sandstone Member of San Andres Formation (of Manzano Group)"}, + {"categories": ["formation_code"],"term": "310MBLC","definition": "Meseta Blanca Sandstone Member of Yeso Formation"}, + {"categories": ["formation_code"],"term": "310TRRS","definition": "Torres Member of Yeso Formation"}, + {"categories": ["formation_code"],"term": "310YESO","definition": "Yeso Formation"}, + {"categories": ["formation_code"],"term": "310YESOG","definition": "Yeso Formation, Manzano Group"}, + {"categories": ["formation_code"],"term": "312CSTL","definition": "Castile Formation"}, + {"categories": ["formation_code"],"term": "312RSLR","definition": "Rustler Formation"}, + {"categories": ["formation_code"],"term": "313ARTS","definition": "Artesia Group"}, + {"categories": ["formation_code"],"term": "313BLCN","definition": "Bell Canyon Formation"}, + {"categories": ["formation_code"],"term": "313BRUC","definition": "Brushy Canyon Formation of Delaware Mountain Group"}, + {"categories": ["formation_code"],"term": "313CKBF","definition": "Chalk Bluff Formation"}, + {"categories": ["formation_code"],"term": "313CLBD","definition": "Carlsbad Limestone"}, + {"categories": ["formation_code"],"term": "313CPTN","definition": "Capitan 
Limestone"}, + {"categories": ["formation_code"],"term": "313GDLP","definition": "Guadalupian Series"}, + {"categories": ["formation_code"],"term": "313GOSP","definition": "Goat Seep Dolomite"}, + {"categories": ["formation_code"],"term": "313SADG","definition": "San Andres Limestone and Glorieta Sandstone"}, + {"categories": ["formation_code"],"term": "313SADR","definition": "San Andres Limestone, undivided"}, + {"categories": ["formation_code"],"term": "313TNSL","definition": "Tansill Formation"}, + {"categories": ["formation_code"],"term": "313YATS","definition": "Yates Formation, Guadalupe Group"}, + {"categories": ["formation_code"],"term": "315LABR","definition": "P Laborcita (Bursum)"}, + {"categories": ["formation_code"],"term": "315YESOABO","definition": "Alamosa Creek and San Agustin Plains area - Yeso and Abo Formations"}, + {"categories": ["formation_code"],"term": "318ABO","definition": "P Abo"}, + {"categories": ["formation_code"],"term": "318BSPG","definition": "Bone Spring Limestone"}, + {"categories": ["formation_code"],"term": "318JOYT","definition": "Joyita Sandstone Member of Yeso Formation"}, + {"categories": ["formation_code"],"term": "318YESO","definition": "Yeso Formation"}, + {"categories": ["formation_code"],"term": "319BRSM","definition": "Bursum Formation and Equivalent Rocks"}, + {"categories": ["formation_code"],"term": "320HLDR","definition": "Penn Holder"}, + {"categories": ["formation_code"],"term": "320PENN","definition": "Pennsylvanian undivided"}, + {"categories": ["formation_code"],"term": "320SNDI","definition": "Sandia Formation"}, + {"categories": ["formation_code"],"term": "321SGDC","definition": "Sangre de Cristo Formation"}, + {"categories": ["formation_code"],"term": "322BEMN","definition": "Penn Beeman"}, + {"categories": ["formation_code"],"term": "325GBLR","definition": "Penn Gobbler"}, + {"categories": ["formation_code"],"term": "325MDER","definition": "Madera Limestone, undivided"}, + {"categories": 
["formation_code"],"term": "325MDERL","definition": "Penn Lower Madera"}, + {"categories": ["formation_code"],"term": "325MDERU","definition": "Penn Upper Madera"}, + {"categories": ["formation_code"],"term": "325SAND","definition": "Penn Sandia"}, + {"categories": ["formation_code"],"term": "326MGDL","definition": "Magdalena Group"}, + {"categories": ["formation_code"],"term": "340EPRS","definition": "Espiritu Santo Formation"}, + {"categories": ["formation_code"],"term": "350PZBA","definition": "Alamosa Creek and San Agustin Plains area - Paleozoic strata beneath Abo Fm"}, + {"categories": ["formation_code"],"term": "350PZBB","definition": "Tul Basin area - Paleozoic strata below Bursum Fm"}, + {"categories": ["formation_code"],"term": "400EMBD","definition": "Embudo Granite (undifferentiated PreCambrian near Santa Fe)"}, + {"categories": ["formation_code"],"term": "400PCMB","definition": "Precambrian Erathem"}, + {"categories": ["formation_code"],"term": "400PREC","definition": "undifferentiated PreCambrian crystalline rocks (X)"}, + {"categories": ["formation_code"],"term": "400PRECintr","definition": "PreCambrian crystalline rocks and local Tertiary intrusives"}, + {"categories": ["formation_code"],"term": "400PRST","definition": "Priest Granite"}, + {"categories": ["formation_code"],"term": "400TUSS","definition": "Tusas Granite"}, + {"categories": ["formation_code"],"term": "410PRCG","definition": "PreCambrian granite (Xg)"}, + {"categories": ["formation_code"],"term": "410PRCGf","definition": "PreCambrian granite, fractured (Xgf)"}, + {"categories": ["formation_code"],"term": "410PRCQ","definition": "PreCambrian quartzite (Xq)"}, + {"categories": ["formation_code"],"term": "410PRCQf","definition": "PreCambrian quartzite, fractured (Xqf)"}, + {"categories": ["formation_code"],"term": "121GILA","definition": "Gila Conglomerate (group)"}, + {"categories": ["formation_code"],"term": "312DYLK","definition": "Dewey Lake Redbeds"}, + {"categories": 
["formation_code"],"term": "120WMVL","definition": "Wimsattville Formation"}, + {"categories": ["formation_code"],"term": "313GRBG","definition": "Grayburg Formation of Artesia Group"}, + {"categories": ["formation_code"],"term": "318ABOL","definition": "Abo Sandstone (Lower Tongue)"}, + {"categories": ["formation_code"],"term": "318ABOU","definition": "Abo Sandstone (Upper Tongue)"}, + {"categories": ["formation_code"],"term": "112SNTFU","definition": "Santa Fe Group, Upper Part"}, + {"categories": ["formation_code"],"term": "310FRNR","definition": "Forty-Niner Member of Rustler Formation"}, + {"categories": ["formation_code"],"term": "312OCHO","definition": "Ochoan Series"}, + {"categories": ["formation_code"],"term": "313AZOT","definition": "Azotea Tongue of Seven Rivers Formation"}, + {"categories": ["formation_code"],"term": "313QUEN","definition": "Queen Formation"}, + {"categories": ["formation_code"],"term": "319HUCO","definition": "Hueco Limestone"}, + {"categories": ["formation_code"],"term": "313SVRV","definition": "Seven Rivers Formation"}, + {"categories": ["formation_code"],"term": "313CABD","definition": "Carlsbad Group"}, + {"categories": ["formation_code"],"term": "320GRMS","definition": "Gray Mesa Member of Madera Formation"}, + {"categories": ["formation_code"],"term": "211CLRDH","definition": "Colorado Shale"}, + {"categories": ["formation_code"],"term": "120BRLM","definition": "Bearwallow Mountain Andesite"}, + {"categories": ["formation_code"],"term": "122RUBO","definition": "Rubio Peak Formation"}, + {"categories": ["formation_code"],"term": "313SADRL","definition": "San Andres Limestone, Lower Cherty Member"}, + {"categories": ["formation_code"],"term": "313SADRU","definition": "San Andres Limestone, Upper Clastic Member"}, + {"categories": ["formation_code"],"term": "313BRNL","definition": "Bernal Formation of Artesia Group"}, + {"categories": ["formation_code"],"term": "318CPDR","definition": "Chupadera Formation"}, + {"categories": 
["formation_code"],"term": "121BDHC","definition": "Bidahochi Formation"}, + {"categories": ["formation_code"],"term": "313SADY","definition": "San Andres Limestone and Yeso Formation, undivided"}, + {"categories": ["formation_code"],"term": "221SRFLL","definition": "San Rafael Group, Lower Part"}, + {"categories": ["formation_code"],"term": "221BLUF","definition": "Bluff Sandstone of Morrison Formation"}, + {"categories": ["formation_code"],"term": "221COSP","definition": "Cow Springs Sandstone of Morrison Formation"}, + {"categories": ["formation_code"],"term": "317ABYS","definition": "Abo and Yeso, undifferentiated"}, + {"categories": ["formation_code"],"term": "221BRSB","definition": "Brushy Basin Shale Member of Morrison Formation"}, + {"categories": ["formation_code"],"term": "310SYDR","definition": "San Ysidro Member of Yeso Formation"}, + {"categories": ["formation_code"],"term": "400SDVL","definition": "Sandoval Granite"}, + {"categories": ["formation_code"],"term": "221SRFL","definition": "San Rafael Group"}, + {"categories": ["formation_code"],"term": "310SGRC","definition": "Sangre de Cristo Formation"}, + {"categories": ["formation_code"],"term": "231TCVS","definition": "Tecovas Formation of Dockum Group"}, + {"categories": ["formation_code"],"term": "211DCRS","definition": "D-Cross Tongue of Mancos Shale of Mesaverde Group"}, + {"categories": ["formation_code"],"term": "211ALSN","definition": "Allison Member of Menefee Formation of Mesaverde Group"}, + {"categories": ["formation_code"],"term": "211LVNN","definition": "La Ventana Tongue of Cliff House Sandstone"}, + {"categories": ["formation_code"],"term": "211MORD","definition": "Madrid Formation"}, + {"categories": ["formation_code"],"term": "210PRMD","definition": "Pyramid Shale"}, + {"categories": ["formation_code"],"term": "124ANMS","definition": "Animas Formation"}, + {"categories": ["formation_code"],"term": "211NBRR","definition": "Niobrara Formation"}, + {"categories": 
["formation_code"],"term": "111ALVM","definition": "Holocene Alluvium"}, + {"categories": ["formation_code"],"term": "122SNTFL","definition": "Santa Fe Group, Lower Part"}, + {"categories": ["formation_code"],"term": "111CPLN","definition": "Capulin Basalts"}, + {"categories": ["formation_code"],"term": "120CRSN","definition": "Carson Conglomerate"}, + {"categories": ["formation_code"],"term": "111CRMS","definition": "Covered/Reclaimed Mine Spoil"}, + {"categories": ["formation_code"],"term": "111CRMSA","definition": "Covered/Reclaimed Mine Spoil and Ash"}, + {"categories": ["formation_code"],"term": "111SPOL","definition": "Spoil"}, + {"categories": ["formation_code"],"term": "110TURT","definition": "Tuerto Gravel of Santa Fe Group"}, + {"categories": ["formation_code"],"term": "221RCPR","definition": "Recapture Shale Member of Morrison Formation"}, + {"categories": ["formation_code"],"term": "320BLNG","definition": "Bullington Member of Magdalena Formation"}, + {"categories": ["formation_code"],"term": "112ANCHsr","definition": "Upper Santa Fe Group, Ancha Formation & ancestral Santa Fe river deposits"}, + {"categories": ["formation_code"],"term": "121TSUQae","definition": "Tesuque Fm Lithosomes A and E"}, + {"categories": ["formation_code"],"term": "230TRSC","definition": "Triassic undifferentiated"}, + {"categories": ["formation_code"],"term": "122TSUQdx","definition": "Tesuque Fm, Dixon member (Ttd)"}, + {"categories": ["formation_code"],"term": "123PICSu","definition": "T upper Picuris Formation (Tpu)"}, + {"categories": ["formation_code"],"term": "123PICSm","definition": "T middle Picuris Formation (Tpm)"}, + {"categories": ["formation_code"],"term": "123PICSmc","definition": "T middle conglomerate Picuris Formation (Tpmc)"}, + {"categories": ["formation_code"],"term": "120VBVC","definition": "Tertiary volcanic breccia/volcaniclastic conglomerate"}, + {"categories": ["formation_code"],"term": "120VCSS","definition": "Tertiary volcaniclastic sandstone"}, + 
{"categories": ["formation_code"],"term": "124DMDT","definition": "Diamond Tail Formation"}, + {"categories": ["formation_code"],"term": "325ALMT","definition": "Penn Alamitos Formation"}, + {"categories": ["formation_code"],"term": "400SAND","definition": "Sandia Granite"}, + {"categories": ["formation_code"],"term": "318VCPK","definition": "Victorio Peak Limestone"}, + {"categories": ["formation_code"],"term": "318BSVP","definition": "Bone Spring and Victorio Peak Limestones"}, + {"categories": ["formation_code"],"term": "100ALVM","definition": "Alluvium"}, + {"categories": ["formation_code"],"term": "310PRMN","definition": "Permian System"}, + {"categories": ["formation_code"],"term": "110AVPS","definition": "Alluvium and Permian System"}, + {"categories": ["formation_code"],"term": "313CRCX","definition": "Capitan Reef Complex and Associated Limestones"}, + {"categories": ["formation_code"],"term": "112SLBL","definition": "Salt Bolson"}, + {"categories": ["formation_code"],"term": "112SBCRC","definition": "Salt Bolson and Capitan Reef Complex"}, + {"categories": ["formation_code"],"term": "313CRDM","definition": "Capitan Reef Complex - Delaware Mountain Group"}, + {"categories": ["formation_code"],"term": "112SBDM","definition": "Salt Bolson and Delaware Mountain Group"}, + {"categories": ["formation_code"],"term": "120BLSN","definition": "Bolson Deposits"}, + {"categories": ["formation_code"],"term": "112SBCR","definition": "Salt Bolson and Cretaceous Rocks"}, + {"categories": ["formation_code"],"term": "112HCBL","definition": "Hueco Bolson"}, + {"categories": ["formation_code"],"term": "120IVIG","definition": "Intrusive Rocks"}, + {"categories": ["formation_code"],"term": "112RLBL","definition": "Red Light Draw Bolson"}, + {"categories": ["formation_code"],"term": "112EFBL","definition": "Eagle Flat Bolson"}, + {"categories": ["formation_code"],"term": "112GRBL","definition": "Green River Bolson"}, + {"categories": ["formation_code"],"term": 
"123SAND","definition": "Sanders Canyon Formation"}, + {"categories": ["formation_code"],"term": "210MRNH","definition": "Moreno Hill Formation"}, + {"categories": ["formation_code"],"term": "320ALMT","definition": "Alamito Shale"}, + {"categories": ["formation_code"],"term": "313DLRM","definition": "Delaware Mountain Group"}, + {"categories": ["formation_code"],"term": "300PLZC","definition": "Paleozoic Erathem"}, + {"categories": ["formation_code"],"term": "122SPRS","definition": "Spears Member of Datil Formation"}, + {"categories": ["formation_code"],"term": "110AVTV","definition": "Alluvium and Tertiary Volcanics"}, + {"categories": ["formation_code"],"term": "313DMBS","definition": "Delaware Mountain Group - Bone Spring Limestone"}, + {"categories": ["formation_code"],"term": "120ERSV","definition": "Tertiary extrusives"}, + {"categories": ["lithology"],"term": "Alluvium","definition": "Alluvium"}, + {"categories": ["lithology"],"term": "Anhydrite","definition": "Anhydrite"}, + {"categories": ["lithology"],"term": "Arkose","definition": "Arkose"}, + {"categories": ["lithology"],"term": "Boulders","definition": "Boulders"}, + {"categories": ["lithology"],"term": "Boulders, silt and clay","definition": "Boulders, silt and clay"}, + {"categories": ["lithology"],"term": "Boulders and sand","definition": "Boulders and sand"}, + {"categories": ["lithology"],"term": "Bentonite","definition": "Bentonite"}, + {"categories": ["lithology"],"term": "Breccia","definition": "Breccia"}, + {"categories": ["lithology"],"term": "Basalt","definition": "Basalt"}, + {"categories": ["lithology"],"term": "Conglomerate","definition": "Conglomerate"}, + {"categories": ["lithology"],"term": "Chalk","definition": "Chalk"}, + {"categories": ["lithology"],"term": "Chert","definition": "Chert"}, + {"categories": ["lithology"],"term": "Clay","definition": "Clay"}, + {"categories": ["lithology"],"term": "Caliche","definition": "Caliche"}, + {"categories": ["lithology"],"term": 
"Calcite","definition": "Calcite"}, + {"categories": ["lithology"],"term": "Clay, some sand","definition": "Clay, some sand"}, + {"categories": ["lithology"],"term": "Claystone","definition": "Claystone"}, + {"categories": ["lithology"],"term": "Coal","definition": "Coal"}, + {"categories": ["lithology"],"term": "Cobbles","definition": "Cobbles"}, + {"categories": ["lithology"],"term": "Cobbles, silt and clay","definition": "Cobbles, silt and clay"}, + {"categories": ["lithology"],"term": "Cobbles and sand","definition": "Cobbles and sand"}, + {"categories": ["lithology"],"term": "Dolomite","definition": "Dolomite"}, + {"categories": ["lithology"],"term": "Dolomite and shale","definition": "Dolomite and shale"}, + {"categories": ["lithology"],"term": "Evaporite","definition": "Evaporite"}, + {"categories": ["lithology"],"term": "Gneiss","definition": "Gneiss"}, + {"categories": ["lithology"],"term": "Gypsum","definition": "Gypsum"}, + {"categories": ["lithology"],"term": "Graywacke","definition": "Graywacke"}, + {"categories": ["lithology"],"term": "Gravel and clay","definition": "Gravel and clay"}, + {"categories": ["lithology"],"term": "Gravel, cemented","definition": "Gravel, cemented"}, + {"categories": ["lithology"],"term": "Gravel, sand and silt","definition": "Gravel, sand and silt"}, + {"categories": ["lithology"],"term": "Granite, gneiss","definition": "Granite, gneiss"}, + {"categories": ["lithology"],"term": "Granite","definition": "Granite"}, + {"categories": ["lithology"],"term": "Gravel, silt and clay","definition": "Gravel, silt and clay"}, + {"categories": ["lithology"],"term": "Gravel","definition": "Gravel"}, + {"categories": ["lithology"],"term": "Igneous undifferentiated","definition": "Igneous undifferentiated"}, + {"categories": ["lithology"],"term": "Lignite","definition": "Lignite"}, + {"categories": ["lithology"],"term": "Limestone and dolomite","definition": "Limestone and dolomite"}, + {"categories": ["lithology"],"term": "Limestone and 
shale","definition": "Limestone and shale"}, + {"categories": ["lithology"],"term": "Limestone","definition": "Limestone"}, + {"categories": ["lithology"],"term": "Marl","definition": "Marl"}, + {"categories": ["lithology"],"term": "Mudstone","definition": "Mudstone"}, + {"categories": ["lithology"],"term": "Metamorphic undifferentiated","definition": "Metamorphic undifferentiated"}, + {"categories": ["lithology"],"term": "Marlstone","definition": "Marlstone"}, + {"categories": ["lithology"],"term": "No Recovery","definition": "No Recovery"}, + {"categories": ["lithology"],"term": "Peat","definition": "Peat"}, + {"categories": ["lithology"],"term": "Quartzite","definition": "Quartzite"}, + {"categories": ["lithology"],"term": "Rhyolite","definition": "Rhyolite"}, + {"categories": ["lithology"],"term": "Sand","definition": "Sand"}, + {"categories": ["lithology"],"term": "Schist","definition": "Schist"}, + {"categories": ["lithology"],"term": "Sand and clay","definition": "Sand and clay"}, + {"categories": ["lithology"],"term": "Sand and gravel","definition": "Sand and gravel"}, + {"categories": ["lithology"],"term": "Sandstone and shale","definition": "Sandstone and shale"}, + {"categories": ["lithology"],"term": "Sand and silt","definition": "Sand and silt"}, + {"categories": ["lithology"],"term": "Sand, gravel and clay","definition": "Sand, gravel and clay"}, + {"categories": ["lithology"],"term": "Shale","definition": "Shale"}, + {"categories": ["lithology"],"term": "Silt","definition": "Silt"}, + {"categories": ["lithology"],"term": "Siltstone and shale","definition": "Siltstone and shale"}, + {"categories": ["lithology"],"term": "Siltstone","definition": "Siltstone"}, + {"categories": ["lithology"],"term": "Slate","definition": "Slate"}, + {"categories": ["lithology"],"term": "Sand, some clay","definition": "Sand, some clay"}, + {"categories": ["lithology"],"term": "Sandstone","definition": "Sandstone"}, + {"categories": ["lithology"],"term": "Silt and 
clay","definition": "Silt and clay"}, + {"categories": ["lithology"],"term": "Travertine","definition": "Travertine"}, + {"categories": ["lithology"],"term": "Tuff","definition": "Tuff"}, + {"categories": ["lithology"],"term": "Volcanic undifferentiated","definition": "Volcanic undifferentiated"}, + {"categories": ["lithology"],"term": "Clay, yellow","definition": "Clay, yellow"}, + {"categories": ["lithology"],"term": "Clay, red","definition": "Clay, red"}, + {"categories": ["lithology"],"term": "Surficial sediment","definition": "Surficial sediment"}, + {"categories": ["lithology"],"term": "Limestone and sandstone, interbedded","definition": "Limestone and sandstone, interbedded"}, + {"categories": ["lithology"],"term": "Gravel and boulders","definition": "Gravel and boulders"}, + {"categories": ["lithology"],"term": "Sand, silt and gravel","definition": "Sand, silt and gravel"}, + {"categories": ["lithology"],"term": "Sand, gravel, silt and clay","definition": "Sand, gravel, silt and clay"}, + {"categories": ["lithology"],"term": "Andesite","definition": "Andesite"}, + {"categories": ["lithology"],"term": "Ignesous, intrusive, undifferentiated","definition": "Ignesous, intrusive, undifferentiated"}, + {"categories": ["lithology"],"term": "Limestone, sandstone and shale","definition": "Limestone, sandstone and shale"}, + {"categories": ["lithology"],"term": "Sand, silt and clay","definition": "Sand, silt and clay"}, {"categories": ["origin_source"], "term": "Reported by another agency", "definition": "Reported by another agency"}, {"categories": ["origin_source"], "term": "From driller's log or well report", "definition": "From driller's log or well report"}, {"categories": ["origin_source"], "term": "Private geologist, consultant or univ associate", "definition": "Private geologist, consultant or univ associate"}, diff --git a/services/util.py b/services/util.py index e9ec08a94..a3ddcf472 100644 --- a/services/util.py +++ b/services/util.py @@ -1,14 +1,13 @@ 
import json import os -from shapely.ops import transform -import pyproj + import httpx -from sqlalchemy.orm import DeclarativeBase +import pyproj +from shapely.ops import transform from sqlalchemy.orm import DeclarativeBase from constants import SRID_WGS84 - TRANSFORMERS = {} METERS_TO_FEET = 3.28084 @@ -151,6 +150,7 @@ def get_epqs_elevation_from_point(lon: float, lat: float) -> float | None: try: data = resp.json() except json.decoder.JSONDecodeError: + print(f"Error decoding JSON from EPQS: {resp.text}") return None return data["value"] diff --git a/transfers/transfer.py b/transfers/transfer.py index 8a9c3bed3..bf0c69b85 100644 --- a/transfers/transfer.py +++ b/transfers/transfer.py @@ -17,7 +17,10 @@ from dotenv import load_dotenv +from db.engine import session_ctx from services.util import get_bool_env +from transfers.aquifer_system_transfer import transfer_aquifer_systems +from transfers.geologic_formation_transfer import transfer_geologic_formations load_dotenv() @@ -60,6 +63,10 @@ def transfer_all(metrics, limit=100): flags = {"TRANSFER_ALL_WELLS": True, "LIMIT": limit} # not currently used + with session_ctx() as session: + transfer_aquifer_systems(session, limit=limit) + transfer_geologic_formations(session, limit=limit) + message("TRANSFERRING WELLS") results = _execute_transfer(WellTransferer, flags=flags) metrics.well_metrics(*results) diff --git a/transfers/transferer.py b/transfers/transferer.py index a8045dccb..4312051fd 100644 --- a/transfers/transferer.py +++ b/transfers/transferer.py @@ -70,6 +70,7 @@ def _limit_iterator(self, session: Session, limit: int, step: int = 25): df = self._get_df_to_iterate() n = len(df) start_time = time.time() + logger.info(f"Starting transfer of {n} [limit={limit}] rows") for i, row in enumerate(df.itertuples()): if limit and i >= limit: logger.info(f"Reached limit of {limit} rows. 
Stopping migration.") diff --git a/transfers/util.py b/transfers/util.py index 24389bc6d..68fe890ec 100644 --- a/transfers/util.py +++ b/transfers/util.py @@ -65,7 +65,6 @@ def estimate_measuring_point_height( ) -> tuple[float, str, datetime | None]: mph = row.MPHeight mph_desc = row.MeasuringPoint - df = self._df[self._df["PointID"] == row.PointID] df = df.sort_values("DateMeasured") if mph is None: @@ -327,9 +326,19 @@ def get_transferable_wells( wellphotos_df = read_csv("WellPhotos") wellphotos_pointids = wellphotos_df["PointID"].unique().tolist() + pointids = list(set(usgs_pointids + collabnet_pointids + wellphotos_pointids)) + logger.info(f"total pointids: {len(pointids)} {pointids[:10]}") + # get all pointids that have owner info + ownerlinks_df = read_csv("OwnerLink") + locdf = read_csv("Location") + + ownerlinks_df = ownerlinks_df.join(locdf.set_index("LocationId"), on="LocationId") + ownerlinks_pointids = ownerlinks_df["PointID"].unique().tolist() + ownerpointids = list(set(ownerlinks_pointids) - set(pointids)) + logger.info(f"ownerpointids: {len(ownerpointids)} {ownerpointids[:10]}") + pointids = pointids + ownerpointids - pointids = list(set(usgs_pointids + collabnet_pointids + wellphotos_pointids)) return df[df["DataSource"].isin(valid_datasources) | df["PointID"].isin(pointids)] @@ -442,7 +451,7 @@ def make_location(row: pd.Series, elevations: dict) -> tuple: else: elevation_from_epqs = True logger.info( - f"Location {row.PointID} has no Altitude. Setting from National Map EPQS for " + f"Location {row.PointID} has no Altitude. Setting from National Map EPQS. " ) z = get_epqs_elevation_from_point(transformed_point.x, transformed_point.y) diff --git a/transfers/well_transfer.py b/transfers/well_transfer.py index eed5c3eb8..9c3c88ec9 100644 --- a/transfers/well_transfer.py +++ b/transfers/well_transfer.py @@ -14,14 +14,14 @@ # limitations under the License. 
# =============================================================================== import json - -# import time +import re +import time from datetime import datetime, UTC -# import re import pandas as pd -from pandas import isna +from pandas import isna, notna from pydantic import ValidationError +from sqlalchemy.exc import DatabaseError from sqlalchemy.orm import Session from core.enums import ( @@ -38,11 +38,11 @@ StatusHistory, MonitoringFrequencyHistory, MeasuringPointHistory, - # DataProvenance, - # AquiferSystem, - # AquiferType, - # GeologicFormation, - # ThingAquiferAssociation, + DataProvenance, + AquiferSystem, + AquiferType, + GeologicFormation, + ThingAquiferAssociation, ) from schemas.thing import CreateWell, CreateWellScreen from services.gcs_helper import get_storage_bucket @@ -125,35 +125,130 @@ def _extract_casing_materials(row) -> list[str]: return materials -# def get_wells_to_transfer(flags: dict = None) -> tuple[pd.DataFrame, pd.DataFrame]: -# # if flags is None: -# # flags = {} -# -# wdf = read_csv("WellData", dtype={"OSEWelltagID": str}) -# ldf = read_csv("Location") -# ldf = ldf.drop(["PointID", "SSMA_TimeStamp"], axis=1) -# wdf = wdf.join(ldf.set_index("LocationId"), on="LocationId") -# wdf = wdf[wdf["SiteType"] == "GW"] -# wdf = wdf[wdf["Easting"].notna() & wdf["Northing"].notna()] -# -# input_df = wdf -# wdf = replace_nans(wdf) -# -# # if flags.get("TRANSFER_ALL_WELLS", False): -# # # todo: filter Locations by DataSource -# # cleaned_df = filter_by_welldata_datasource_and_project(wdf) -# # else: -# # # get a subset of wells that have not been transferred yet -# # # todo: this needs to be defined. 
-# # # for now, we are just filtering out wells that have not been transferred yet -# # # In the future we will be using criteria to determine which wells to transfer -# # # for example, wells in the "Water Level Network" project -# # cleaned_df = wdf -# -# cleaned_df = get_transferable_wells(wdf) -# cleaned_df = filter_non_transferred_wells(cleaned_df) -# -# return input_df, cleaned_df +pattern = re.compile( + r"\b(?P<term>jet|hand|submersible)\b|\b(?P<phrase>line[-\s]+shaft)\b", re.IGNORECASE +) + + +def first_matched_term(text: str): + m = pattern.search(text) + if not m: + return None + return m.group("term") or m.group("phrase") + + +PUMP_MAPPING = {"jet": "Jet", "hand": "Hand", "submersible": "Submersible"} + + +def _extract_well_pump_type(row) -> str | None: + if isna(row.ConstructionNotes): + return None + construction_notes = row.ConstructionNotes.lower() + return PUMP_MAPPING.get(first_matched_term(construction_notes), None) + + +# Parse aquifer codes +def _extract_aquifer_type_codes(aquifer_code: str) -> list[str]: + """ + Parse aquifer type codes that may contain multiple values. + + Args: + aquifer_code: Raw code from AquiferType field + + Returns: + List of individual codes + """ + if not aquifer_code: + return [] + # clean the code + code = aquifer_code.strip().upper() + # split into individual characters. This handles cases like "FC" -> ["F", "C"] + individual_codes = list(code) + return individual_codes + + +# Get or create aquifer system +def get_or_create_aquifer_system( + session: Session, aquifer_name: str, primary_type: str +) -> AquiferSystem | None: + """ + Get existing aquifer or create new one if it doesn't exist. + + With the new AquiferType model, we create ONE aquifer record per named + aquifer (e.g., one "Santa Fe Group"), not multiple variants. 
+ + Args: + session: Database session + aquifer_name: Name of the aquifer (from AqClass or type name) + primary_type: Primary aquifer type for the aquifer_type field + """ + # Try to find existing aquifer by name + aquifer = ( + session.query(AquiferSystem).filter(AquiferSystem.name == aquifer_name).first() + ) + + if aquifer: + return aquifer + + # Create new aquifer + try: + logger.info( + f"Creating new aquifer system: {aquifer_name} (primary type: {primary_type})" + ) + + aquifer = AquiferSystem( + name=aquifer_name, + primary_aquifer_type=primary_type, # Primary type + geographic_scale=None, # Default + ) + session.add(aquifer) + session.commit() + # session.flush() # Get the ID + # session.refresh(aquifer) + return aquifer + except DatabaseError as e: + session.rollback() + logger.critical(f"Error creating aquifer {aquifer_name}: {e}") + return None + + +def get_or_create_geologic_formation( + session: Session, formation_code: str +) -> GeologicFormation | None: + """ + Get existing geologic formation or create new one if it doesn't exist. 
+ + Args: + session: Database session + formation_code: The formation code from FormationZone field + + Returns: + GeologicFormation object or None if creation fails + """ + # Try to find existing formation + formation = ( + session.query(GeologicFormation) + .filter(GeologicFormation.formation_code == formation_code) + .first() + ) + + if formation: + return formation + + # If not found, create new formation + try: + logger.info(f"Creating new geologic formation: {formation_code}") + formation = GeologicFormation( + formation_code=formation_code, + description=None, + lithology=None, + ) + session.add(formation) + session.flush() + return formation + except Exception as e: + logger.critical(f"Error creating formation {formation_code}: {e}") + return None def get_cached_elevations() -> dict: @@ -222,13 +317,15 @@ def _step(self, session: Session, df: pd.DataFrame, i: int, row: pd.Series): try: location, elevation_method = make_location(row, self._cached_elevations) session.add(location) + session.commit() self._added_locations[row.PointID] = elevation_method except Exception as e: + self._capture_error(row.PointID, str(e), str(e), "Location") + logger.critical(f"Error making location for {row.PointID}: {e}") + if location is not None: session.expunge(location) - self._capture_error(row.PointID, str(e), str(e), "Location") - logger.critical(f"Error making location for {row.PointID}: {e}") return try: @@ -237,6 +334,7 @@ def _step(self, session: Session, df: pd.DataFrame, i: int, row: pd.Series): well_casing_materials = ( [] if isna(row.CasingDescription) else _extract_casing_materials(row) ) + well_pump_type = _extract_well_pump_type(row) # manually add the well rather than add_well from services/thing_helper.py # so that effective_start can be set on the location assocation @@ -253,13 +351,26 @@ def _step(self, session: Session, df: pd.DataFrame, i: int, row: pd.Series): ), well_casing_depth=row.CasingDepth, release_status="public" if row.PublicRelease else 
"private", - measuring_point_height=0, - measuring_point_description="", - # measuring_point_height=row.MPHeight, - # measuring_point_description=row.MeasuringPoint, + measuring_point_height=row.MPHeight, + measuring_point_description=row.MeasuringPoint, notes=( [{"content": row.Notes, "note_type": "Other"}] if row.Notes else [] ), + well_completion_date=row.CompletionDate, + well_driller_name=row.DrillerName, + well_construction_method=( + lexicon_mapper.map_value( + f"LU_ConstructionMethod:{row.ConstructionMethod}" + ) + if not isna(row.ConstructionMethod) + else None + ), + well_pump_type=well_pump_type, + is_suitable_for_datalogger=( + bool(row.OpenWellLoggerOK) + if not isna(row.OpenWellLoggerOK) + else None + ), ) CreateWell.model_validate(data) @@ -280,6 +391,8 @@ def _step(self, session: Session, df: pd.DataFrame, i: int, row: pd.Series): "well_casing_materials", "measuring_point_height", "measuring_point_description", + "well_completion_date_source", + "well_construction_method_source", ] ) well_data["thing_type"] = "water well" @@ -288,17 +401,6 @@ def _step(self, session: Session, df: pd.DataFrame, i: int, row: pd.Series): well_data.pop("notes") well = Thing(**well_data) session.add(well) - # logger.info(f"Created well for {row.PointID}") - - # flush well to access its ID for status_history - # session.flush() - - # session.commit() - # session.refresh(well) - # if notes: - # for ni in notes: - # nn = well.add_note(ni['content'], ni['note_type']) - # session.add(nn) if well_purposes: for wp in well_purposes: @@ -334,13 +436,162 @@ def _step(self, session: Session, df: pd.DataFrame, i: int, row: pd.Series): assoc.thing = well session.add(assoc) + if isna(row.AquiferType): + logger.info( + f"No AquiferType for {well.name}. Skipping aquifer association." 
+ ) + else: + try: + self._add_aquifers(session, row, well) + except Exception as e: + logger.critical( + f"Error creating aquifer association for {well.name}: {e}" + ) + + if isna(row.FormationZone): + logger.info( + f"No FormationZone for {well.name}. Skipping formation association." + ) + else: + try: + self._add_formation_zone(session, row, well) + except Exception as e: + logger.critical( + f"Error creating formation association for {well.name}: {e}" + ) + + def _add_formation_zone(self, session, row, well): + # --- Set Formation Completion (NOT depth-based stratigraphy) --- + # This simply records which formation the well was completed in. + # For detailed depth-interval stratigraphy, see stratigraphy_transfer.py + + formation_code = row.FormationZone + + # Validate formation exists + formation = ( + session.query(GeologicFormation) + .filter(GeologicFormation.formation_code == formation_code) + .first() + ) + + if formation: + # Formation exists: Set association + well.formation_completion_code = formation_code + logger.info(f"Set completion formation for {well.name}: {formation_code}") + else: + # Formation does NOT exist: Do not create new formation. Flag and log for review + logger.critical( + f"MISSING FORMATION: Formation '{formation_code}' not found for well {well.name}. Flagged for review." 
+ ) + self._capture_error( + row.PointID, f"Unknown formation: {formation_code}", "FormationZone" + ) + + def _add_aquifers(self, session, row, well): + # Parse codes (handles multi-character codes like "FC") + aquifer_codes = _extract_aquifer_type_codes(row.AquiferType) + + if not aquifer_codes: + logger.warning( + f"Well {row.PointID}: Empty aquifer codes after parsing '{row.AquiferType}'" + ) + return + + # Map AqClass code to aquifer name using lexicon mapper + if isna(row.AqClass): + # No AqClass - use first code's mapped name as aquifer name + aquifer_name = lexicon_mapper.map_value( + f"LU_AquiferType:{aquifer_codes[0]}" + ) + else: + try: + aquifer_name = lexicon_mapper.map_value( + f"LU_AquiferClass:{row.AqClass}" + ) + except KeyError: + logger.warning( + f"Unknown AqClass code '{row.AqClass}' for well {row.PointID}, using first type as name" + ) + aquifer_name = lexicon_mapper.map_value( + f"LU_AquiferType:{aquifer_codes[0]}" + ) + + # Determine primary type + # This assumes the first recorded type of a compound type is the primary type of the aquifer. + # TODO: verify with AMMP + try: + primary_type = lexicon_mapper.map_value( + f"LU_AquiferType:{aquifer_codes[0]}" + ) + except KeyError: + logger.warning( + f"Unknown aquifer type code '{aquifer_codes[0]}' for well {row.PointID}." 
+ f"Setting primary_type to 'Unknown'" + ) + primary_type = "Unknown" # Creates aquifer with placeholder + + # Get or create the aquifer + aquifer = get_or_create_aquifer_system(session, aquifer_name, primary_type) + logger.info(f"working with {aquifer}, {aquifer.id}") + if aquifer: + # Check if association already exists + existing_assoc = ( + session.query(ThingAquiferAssociation) + .filter( + ThingAquiferAssociation.thing_id == well.id, + ThingAquiferAssociation.aquifer_system_id == aquifer.id, + ) + .first() + ) + + if not existing_assoc: + # Create the association + logger.info(f"Associating well {well.name} with aquifer {aquifer.name}") + aquifer_assoc = ThingAquiferAssociation( + thing=well, aquifer_system=aquifer + ) + session.add(aquifer_assoc) + session.flush() + + # Create AquiferType records for EACH characteristic + aquifer_type_names = [] + for aquifer_code in aquifer_codes: + try: + type_name = lexicon_mapper.map_value( + f"LU_AquiferType:{aquifer_code}" + ) + aquifer_type = AquiferType( + thing_aquifer_association=aquifer_assoc, + aquifer_type=type_name, + ) + session.add(aquifer_type) + aquifer_type_names.append(type_name) + except KeyError: + logger.critical( + f"Unknown aquifer code '{aquifer_code}' from AquiferType='{row.AquiferType}' " + f"for well {well.name}. Skipping this code." 
+ ) + self._capture_error( + row.PointID, + f"Unknown aquifer code: {aquifer_code}", + "AquiferType", + ) + + logger.info( + f"Associated well {well.name} with aquifer {aquifer.name} " + f"(types: {', '.join(aquifer_type_names)})" + ) + def _after_hook(self, session): dump_cached_elevations(self._cached_elevations) measuring_point_estimator = MeasuringPointEstimator() # add things thate need well id - for well in session.query(Thing).filter(Thing.thing_type == "water well").all(): + query = session.query(Thing).filter(Thing.thing_type == "water well") + count = query.count() + for i, well in enumerate(query.all()): + step_start_time = time.time() row = self.cleaned_df[self.cleaned_df["PointID"] == well.name].iloc[0] - if not isna(row.Notes): + if notna(row.Notes): note = well.add_note(row.Notes, "Other") session.add(note) @@ -352,8 +603,49 @@ def _after_hook(self, session): for dp in data_provenances: session.add(dp) - mphs = measuring_point_estimator.estimate_measuring_point_height(row) + for row_field, kw in ( + ( + "CompletionSource", + dict( + field_name="well_completion_date", + origin_type=lexicon_mapper.map_value( + f"LU_Depth_CompletionSource:{row.CompletionSource}" + ), + ), + ), + ( + "DataSource", + dict( + field_name="well_construction_method", + origin_source=row.DataSource, + ), + ), + ( + "DepthSource", + dict( + field_name="well_depth", + origin_type=lexicon_mapper.map_value( + f"LU_Depth_CompletionSource:{row.DepthSource}" + ), + ), + ), + ): + if notna(row[row_field]): + try: + dp = DataProvenance( + target_id=well.id, target_table="thing", **kw + ) + session.add(dp) + session.commit() + except DatabaseError as e: + self._capture_error(row.PointID, str(e), "DataProvenance") + session.rollback() + start_time = time.time() + mphs = measuring_point_estimator.estimate_measuring_point_height(row) + logger.info( + f"Estimated measuring point heights for {well.name}: {time.time() - start_time:.2f}s" + ) for mph, mph_desc, start_date, end_date in mphs: 
measuring_point_history = MeasuringPointHistory( thing_id=well.id, @@ -377,7 +669,7 @@ def _after_hook(self, session): target_id = well.id target_table = "thing" - if not isna(row.MonitoringStatus): + if notna(row.MonitoringStatus): if ( "X" in row.MonitoringStatus or "I" in row.MonitoringStatus @@ -414,7 +706,7 @@ def _after_hook(self, session): f" Adding '{monitoring_frequency}' monitoring frequency for well {well.name}" ) - if not isna(row.Status): + if notna(row.Status): status_value = lexicon_mapper.map_value(f"LU_Status:{row.Status}") status_history = StatusHistory( status_type="Well Status", @@ -427,6 +719,10 @@ def _after_hook(self, session): session.add(status_history) logger.info(f" Added well status for well {well.name}: {status_value}") + logger.info( + f"After hook: {well.name} {i+1}/{count} took {time.time() - step_start_time:.2f}s" + ) + session.commit() From 308a7ca773d8ea63504ea9bfe79290fc23ca69b7 Mon Sep 17 00:00:00 2001 From: jakeross Date: Tue, 2 Dec 2025 21:11:22 -0700 Subject: [PATCH 32/66] fix: enable database rebuild and update measuring point history to include reason --- tests/features/environment.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/features/environment.py b/tests/features/environment.py index afbc2d13c..13bcdead3 100644 --- a/tests/features/environment.py +++ b/tests/features/environment.py @@ -497,7 +497,7 @@ def add_geologic_formation(context, session, formation_code, well): def before_all(context): context.objects = {} rebuild = False - # rebuild = True + rebuild = True if rebuild: erase_and_rebuild_db() @@ -539,7 +539,7 @@ def before_all(context): for well in (well_1, well_2, well_3): add_measuring_point_history(context, session, well=well) - for value, start, end in ( + for value, start, end, reason in ( ( "Active, pumping well", datetime(2020, 1, 1), @@ -560,7 +560,7 @@ def before_all(context): status_value=value, start_date=start, end_date=end, - reason="Initial status", + reason=reason, 
target_id=context.objects["wells"][0].id, target_table="thing", ) From ab5a600fe5639d1e555612edea9b772f2a924200 Mon Sep 17 00:00:00 2001 From: jakeross Date: Tue, 2 Dec 2025 21:29:19 -0700 Subject: [PATCH 33/66] refactor: remove unnecessary return statements and logging for clarity in transfer processes --- transfers/waterlevels_transfer.py | 1 - transfers/well_transfer.py | 1 - 2 files changed, 2 deletions(-) diff --git a/transfers/waterlevels_transfer.py b/transfers/waterlevels_transfer.py index 80b8a4bd8..270592a66 100644 --- a/transfers/waterlevels_transfer.py +++ b/transfers/waterlevels_transfer.py @@ -283,7 +283,6 @@ def _get_field_event_participants(self, session, row, thing) -> list[Contact]: logger.critical( f"No contacts can be associated with the WaterLevels record with GlobalID {row.GlobalID}, therefore no field event, field activity, sample, and observation can be made. Skipping." ) - return None return field_event_participants diff --git a/transfers/well_transfer.py b/transfers/well_transfer.py index 9c3c88ec9..314593250 100644 --- a/transfers/well_transfer.py +++ b/transfers/well_transfer.py @@ -532,7 +532,6 @@ def _add_aquifers(self, session, row, well): # Get or create the aquifer aquifer = get_or_create_aquifer_system(session, aquifer_name, primary_type) - logger.info(f"working with {aquifer}, {aquifer.id}") if aquifer: # Check if association already exists existing_assoc = ( From 5ade1b2d3b7b71e565384c3ff252afc056332778 Mon Sep 17 00:00:00 2001 From: jakeross Date: Tue, 2 Dec 2025 21:50:50 -0700 Subject: [PATCH 34/66] refactor: optimize date handling in deployment search logic for improved clarity --- transfers/waterlevels_transducer_transfer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transfers/waterlevels_transducer_transfer.py b/transfers/waterlevels_transducer_transfer.py index 927d8d6b8..74eaafd06 100644 --- a/transfers/waterlevels_transducer_transfer.py +++ b/transfers/waterlevels_transducer_transfer.py @@ 
-195,12 +195,12 @@ class WaterLevelsContinuousAcousticTransferer(WaterLevelsContinuousTransferer): def _find_deployment(ts, deployments): + date = ts.date() for d in deployments: - start = Timestamp(d.installation_date) - if start > ts: + if d.installation_date > date: break # because sorted by start - end = Timestamp(d.removal_date) if d.removal_date else Timestamp.max - if end >= ts: + end = d.removal_date if d.removal_date else Timestamp.max.date() + if end >= date: return d return None From 306dabcd655621e1882b0e1a75406bc243783e75 Mon Sep 17 00:00:00 2001 From: kbighorse Date: Wed, 3 Dec 2025 06:41:26 +0000 Subject: [PATCH 35/66] Formatting changes --- tests/test_transfer_legacy_dates.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_transfer_legacy_dates.py b/tests/test_transfer_legacy_dates.py index aa054740c..05dbe8dfe 100644 --- a/tests/test_transfer_legacy_dates.py +++ b/tests/test_transfer_legacy_dates.py @@ -35,7 +35,7 @@ # ============================================================================ -@patch('transfers.util.lexicon_mapper') +@patch("transfers.util.lexicon_mapper") def test_make_location_with_both_legacy_dates(mock_lexicon_mapper): """Test that make_location populates both legacy_date_created and legacy_site_date""" # Mock lexicon mapper to avoid GCS calls @@ -77,7 +77,7 @@ def test_make_location_with_both_legacy_dates(mock_lexicon_mapper): assert location.created_at is None -@patch('transfers.util.lexicon_mapper') +@patch("transfers.util.lexicon_mapper") def test_make_location_with_only_date_created(mock_lexicon_mapper): """Test that make_location handles locations with only DateCreated (no SiteDate)""" # Mock lexicon mapper to avoid GCS calls @@ -111,7 +111,7 @@ def test_make_location_with_only_date_created(mock_lexicon_mapper): assert location.legacy_site_date is None -@patch('transfers.util.lexicon_mapper') +@patch("transfers.util.lexicon_mapper") def 
test_make_location_with_site_date_later_than_date_created(mock_lexicon_mapper): """Test data anomaly: SiteDate is later than DateCreated (should still be accepted)""" # Mock lexicon mapper to avoid GCS calls @@ -143,7 +143,7 @@ def test_make_location_with_site_date_later_than_date_created(mock_lexicon_mappe assert location.legacy_site_date == datetime.date(2015, 6, 20) -@patch('transfers.util.lexicon_mapper') +@patch("transfers.util.lexicon_mapper") def test_make_location_with_very_old_site_date(mock_lexicon_mapper): """Test that very old SiteDates (1950s) are preserved correctly""" # Mock lexicon mapper to avoid GCS calls @@ -179,7 +179,7 @@ def test_make_location_with_very_old_site_date(mock_lexicon_mapper): assert time_gap == 19751 # Approximately 54 years -@patch('transfers.util.lexicon_mapper') +@patch("transfers.util.lexicon_mapper") def test_make_location_legacy_dates_are_date_not_datetime(mock_lexicon_mapper): """Test that legacy date fields are Date type (not DateTime)""" # Mock lexicon mapper to avoid GCS calls @@ -218,7 +218,7 @@ def test_make_location_legacy_dates_are_date_not_datetime(mock_lexicon_mapper): assert location.legacy_site_date == datetime.date(2002, 12, 10) -@patch('transfers.util.lexicon_mapper') +@patch("transfers.util.lexicon_mapper") def test_make_location_legacy_dates_independent_of_created_at(mock_lexicon_mapper): """Test that legacy dates don't affect created_at timestamp""" # Mock lexicon mapper to avoid GCS calls @@ -347,7 +347,7 @@ def test_create_well_completed_on_is_date_not_datetime(): # ============================================================================ -@patch('transfers.util.lexicon_mapper') +@patch("transfers.util.lexicon_mapper") def test_location_legacy_date_coverage_statistics(mock_lexicon_mapper): """Test that migration preserves expected percentages of legacy dates""" # Mock lexicon mapper to avoid GCS calls From d8167a7e94c8687f01e3092912077a3dde618f1c Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: 
Tue, 2 Dec 2025 22:41:37 -0800 Subject: [PATCH 36/66] Resolve test failures --- tests/test_thing.py | 7 ++--- tests/test_transfer_legacy_dates.py | 44 +++++++++++++++++++++++------ 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/tests/test_thing.py b/tests/test_thing.py index eaa541668..94d00aa85 100644 --- a/tests/test_thing.py +++ b/tests/test_thing.py @@ -1207,7 +1207,7 @@ def test_create_well_without_completion_date(location): def test_spring_well_completed_on_is_null(location): - """Test that springs have null well_completed_on field""" + """Test that springs do NOT have well_completed_on field (it's well-specific)""" payload = { "name": "Test Spring", "location_id": location.id, @@ -1218,9 +1218,8 @@ def test_spring_well_completed_on_is_null(location): assert response.status_code == 201 data = response.json() - # Springs should have null well_completed_on - assert "well_completed_on" in data - assert data["well_completed_on"] is None + # Springs should NOT have well_completed_on field (only wells have completion dates) + assert "well_completed_on" not in data assert data["thing_type"] == "spring" # cleanup after test diff --git a/tests/test_transfer_legacy_dates.py b/tests/test_transfer_legacy_dates.py index 30fbcd5ae..aa054740c 100644 --- a/tests/test_transfer_legacy_dates.py +++ b/tests/test_transfer_legacy_dates.py @@ -22,7 +22,7 @@ 3. 
Thing.well_completed_on is populated from CSV CompletionDate (if not null) """ import datetime -from unittest.mock import Mock, patch +from unittest.mock import Mock, patch, MagicMock import pandas as pd import pytest @@ -35,8 +35,12 @@ # ============================================================================ -def test_make_location_with_both_legacy_dates(): +@patch('transfers.util.lexicon_mapper') +def test_make_location_with_both_legacy_dates(mock_lexicon_mapper): """Test that make_location populates both legacy_date_created and legacy_site_date""" + # Mock lexicon mapper to avoid GCS calls + mock_lexicon_mapper.map_value.return_value = "GPS" + # Create a mock CSV row with both DateCreated and SiteDate row = pd.Series( { @@ -73,8 +77,12 @@ def test_make_location_with_both_legacy_dates(): assert location.created_at is None -def test_make_location_with_only_date_created(): +@patch('transfers.util.lexicon_mapper') +def test_make_location_with_only_date_created(mock_lexicon_mapper): """Test that make_location handles locations with only DateCreated (no SiteDate)""" + # Mock lexicon mapper to avoid GCS calls + mock_lexicon_mapper.map_value.return_value = "GPS" + row = pd.Series( { "PointID": "TEST-002", @@ -103,8 +111,12 @@ def test_make_location_with_only_date_created(): assert location.legacy_site_date is None -def test_make_location_with_site_date_later_than_date_created(): +@patch('transfers.util.lexicon_mapper') +def test_make_location_with_site_date_later_than_date_created(mock_lexicon_mapper): """Test data anomaly: SiteDate is later than DateCreated (should still be accepted)""" + # Mock lexicon mapper to avoid GCS calls + mock_lexicon_mapper.map_value.return_value = "GPS" + row = pd.Series( { "PointID": "TEST-003", @@ -131,8 +143,12 @@ def test_make_location_with_site_date_later_than_date_created(): assert location.legacy_site_date == datetime.date(2015, 6, 20) -def test_make_location_with_very_old_site_date(): +@patch('transfers.util.lexicon_mapper') 
+def test_make_location_with_very_old_site_date(mock_lexicon_mapper): """Test that very old SiteDates (1950s) are preserved correctly""" + # Mock lexicon mapper to avoid GCS calls + mock_lexicon_mapper.map_value.return_value = "GPS" + row = pd.Series( { "PointID": "SM-0227", # Real example from dataset @@ -163,8 +179,12 @@ def test_make_location_with_very_old_site_date(): assert time_gap == 19751 # Approximately 54 years -def test_make_location_legacy_dates_are_date_not_datetime(): +@patch('transfers.util.lexicon_mapper') +def test_make_location_legacy_dates_are_date_not_datetime(mock_lexicon_mapper): """Test that legacy date fields are Date type (not DateTime)""" + # Mock lexicon mapper to avoid GCS calls + mock_lexicon_mapper.map_value.return_value = "GPS" + row = pd.Series( { "PointID": "TEST-004", @@ -198,8 +218,12 @@ def test_make_location_legacy_dates_are_date_not_datetime(): assert location.legacy_site_date == datetime.date(2002, 12, 10) -def test_make_location_legacy_dates_independent_of_created_at(): +@patch('transfers.util.lexicon_mapper') +def test_make_location_legacy_dates_independent_of_created_at(mock_lexicon_mapper): """Test that legacy dates don't affect created_at timestamp""" + # Mock lexicon mapper to avoid GCS calls + mock_lexicon_mapper.map_value.return_value = "GPS" + row = pd.Series( { "PointID": "TEST-005", @@ -323,8 +347,12 @@ def test_create_well_completed_on_is_date_not_datetime(): # ============================================================================ -def test_location_legacy_date_coverage_statistics(): +@patch('transfers.util.lexicon_mapper') +def test_location_legacy_date_coverage_statistics(mock_lexicon_mapper): """Test that migration preserves expected percentages of legacy dates""" + # Mock lexicon mapper to avoid GCS calls + mock_lexicon_mapper.map_value.return_value = "GPS" + # Simulate 100 location records from CSV locations_created = 0 locations_with_site_date = 0 From de1e5cb916a2fe9e577b8a85e509cc1144ad95f7 Mon Sep 17 
00:00:00 2001 From: Kimball Bighorse Date: Tue, 2 Dec 2025 22:55:25 -0800 Subject: [PATCH 37/66] Update column name in BDD tests --- .../steps/post_migration_legacy_data.py | 152 +++++++++--------- 1 file changed, 76 insertions(+), 76 deletions(-) diff --git a/tests/features/steps/post_migration_legacy_data.py b/tests/features/steps/post_migration_legacy_data.py index e78afbde7..162358308 100644 --- a/tests/features/steps/post_migration_legacy_data.py +++ b/tests/features/steps/post_migration_legacy_data.py @@ -31,7 +31,7 @@ def parse_number(text): register_type(Number=parse_number) -def create_test_location(legacy_date_created=None, inventoried_on=None): +def create_test_location(legacy_date_created=None, legacy_site_date=None): """Helper to create a test location with legacy dates.""" with session_ctx() as session: location = Location( @@ -39,7 +39,7 @@ def create_test_location(legacy_date_created=None, inventoried_on=None): elevation=1558.8, release_status="public", legacy_date_created=legacy_date_created, - inventoried_on=inventoried_on, + legacy_site_date=legacy_site_date, ) session.add(location) session.commit() @@ -99,14 +99,14 @@ def step_given_location_with_table(context: Context): if data.get("legacy_date_created") and data["legacy_date_created"] != "null" else None ) - inventoried_on = ( - date.fromisoformat(data["inventoried_on"]) - if data.get("inventoried_on") and data["inventoried_on"] != "null" + legacy_site_date = ( + date.fromisoformat(data["legacy_site_date"]) + if data.get("legacy_site_date") and data["legacy_site_date"] != "null" else None ) location = create_test_location( - legacy_date_created=legacy_date_created, inventoried_on=inventoried_on + legacy_date_created=legacy_date_created, legacy_site_date=legacy_site_date ) context.test_location = location @@ -127,28 +127,28 @@ def step_given_multiple_locations(context: Context, count: int): ] for i in range(min(count, len(test_data))): - legacy_date, inventory_date = test_data[i] + legacy_date, 
site_date = test_data[i] location = create_test_location( legacy_date_created=date.fromisoformat(legacy_date), - inventoried_on=( - date.fromisoformat(inventory_date) if inventory_date else None + legacy_site_date=( + date.fromisoformat(site_date) if site_date else None ), ) context.test_locations.append(location) @given( - "locations exist with inventoried_on ranging from {start_year:Number} to {end_year:Number}" + "locations exist with legacy_site_date ranging from {start_year:Number} to {end_year:Number}" ) def step_given_locations_date_range(context: Context, start_year: int, end_year: int): - """Create locations with inventoried_on across a date range.""" + """Create locations with legacy_site_date across a date range.""" context.test_locations = [] years = [1954, 2002, 2003, 2010, 2015, 2020, 2024] for year in years: location = create_test_location( - legacy_date_created=date(year + 5, 1, 1), # Always 5 years after inventory - inventoried_on=date(year, 6, 15), + legacy_date_created=date(year + 5, 1, 1), # Always 5 years after site date + legacy_site_date=date(year, 6, 15), ) context.test_locations.append(location) @@ -166,7 +166,7 @@ def step_given_locations_with_specific_date( for i in range(count): location = create_test_location( legacy_date_created=target, - inventoried_on=date(2000 + i, 1, 1), # Vary the inventory dates + legacy_site_date=date(2000 + i, 1, 1), # Vary the site dates ) context.test_locations.append(location) @@ -261,16 +261,16 @@ def step_given_well_location_has_table(context: Context): if data.get("legacy_date_created") else None ) - inventoried_on = ( - date.fromisoformat(data.get("inventoried_on")) - if data.get("inventoried_on") + legacy_site_date = ( + date.fromisoformat(data.get("legacy_site_date")) + if data.get("legacy_site_date") else None ) with session_ctx() as session: location = session.get(Location, context.test_well_location.id) location.legacy_date_created = legacy_date_created - location.inventoried_on = inventoried_on + 
location.legacy_site_date = legacy_site_date session.commit() session.refresh(location) context.test_well_location = location @@ -282,12 +282,12 @@ def step_given_count_locations_migrated(context: Context, count: int): context.test_locations = [] for i in range(count): - # 9% have inventoried_on - has_inventory = i < count * 0.09 + # 9% have legacy_site_date + has_site_date = i < count * 0.09 location = create_test_location( legacy_date_created=date(2014, 1, i % 28 + 1), - inventoried_on=date(2003, 1, i % 28 + 1) if has_inventory else None, + legacy_site_date=date(2003, 1, i % 28 + 1) if has_site_date else None, ) context.test_locations.append(location) @@ -323,7 +323,7 @@ def step_given_completion_count(context: Context, count: int): def step_given_location_migrated_with_dates(context: Context): """Create location with both legacy dates.""" location = create_test_location( - legacy_date_created=date(2014, 4, 3), inventoried_on=date(2002, 12, 10) + legacy_date_created=date(2014, 4, 3), legacy_site_date=date(2002, 12, 10) ) context.test_location = location @@ -364,7 +364,7 @@ def step_when_get_all_locations(context: Context): @when( - 'I filter locations where inventoried_on is between "{start_date}" and "{end_date}"' + 'I filter locations where legacy_site_date is between "{start_date}" and "{end_date}"' ) def step_when_filter_locations(context: Context, start_date: str, end_date: str): """Filter locations by date range.""" @@ -375,7 +375,7 @@ def step_when_filter_locations(context: Context, start_date: str, end_date: str) locations = ( session.query(Location) - .filter(Location.inventoried_on >= start, Location.inventoried_on <= end) + .filter(Location.legacy_site_date >= start, Location.legacy_site_date <= end) .all() ) @@ -509,10 +509,10 @@ def step_then_legacy_date_created(context: Context, expected_date: str): assert actual == expected_date, f"Expected {expected_date}, got {actual}" -@then('the response should include inventoried_on as "{expected_date}"') -def 
step_then_inventoried_on(context: Context, expected_date: str): - """Assert inventoried_on matches.""" - actual = context.location_response.get("inventoried_on") +@then('the response should include legacy_site_date as "{expected_date}"') +def step_then_legacy_site_date(context: Context, expected_date: str): + """Assert legacy_site_date matches.""" + actual = context.location_response.get("legacy_site_date") assert actual == expected_date, f"Expected {expected_date}, got {actual}" @@ -520,15 +520,15 @@ def step_then_inventoried_on(context: Context, expected_date: str): def step_then_time_gap_years(context: Context, years: str): """Assert approximate year gap.""" legacy_str = context.location_response.get("legacy_date_created") - inventory_str = context.location_response.get("inventoried_on") + site_date_str = context.location_response.get("legacy_site_date") - if not legacy_str or not inventory_str: + if not legacy_str or not site_date_str: raise AssertionError("Missing date fields for gap calculation") legacy_date = date.fromisoformat(legacy_str) - inventory_date = date.fromisoformat(inventory_str) + site_date = date.fromisoformat(site_date_str) - gap_days = (legacy_date - inventory_date).days + gap_days = (legacy_date - site_date).days gap_years = gap_days / 365.25 expected_years = float(years) @@ -546,47 +546,47 @@ def step_then_all_have_legacy_field(context: Context): assert "legacy_date_created" in item, f"Location missing legacy_date_created" -@then("each location should have an inventoried_on field") -def step_then_all_have_inventory_field(context: Context): +@then("each location should have a legacy_site_date field") +def step_then_all_have_site_date_field(context: Context): """Assert all locations have the field.""" items = context.locations_response.get("items", []) for item in items: - assert "inventoried_on" in item, f"Location missing inventoried_on" + assert "legacy_site_date" in item, f"Location missing legacy_site_date" -@then("some locations should 
have null inventoried_on") -def step_then_some_null_inventory(context: Context): +@then("some locations should have null legacy_site_date") +def step_then_some_null_site_date(context: Context): """Assert some locations have null.""" items = context.locations_response.get("items", []) - null_count = sum(1 for item in items if item.get("inventoried_on") is None) - assert null_count > 0, "Expected at least one location with null inventoried_on" + null_count = sum(1 for item in items if item.get("legacy_site_date") is None) + assert null_count > 0, "Expected at least one location with null legacy_site_date" -@then("the response should only include locations inventoried in that decade") +@then("the response should only include locations with site date in that decade") def step_then_locations_in_decade(context: Context): """Assert filtered locations are in range.""" for loc in context.filtered_locations: assert ( - 2000 <= loc.inventoried_on.year <= 2010 - ), f"Location not in 2000-2010: {loc.inventoried_on}" + 2000 <= loc.legacy_site_date.year <= 2010 + ), f"Location not in 2000-2010: {loc.legacy_site_date}" -@then("locations inventoried before {year:Number} should not be included") +@then("locations with site date before {year:Number} should not be included") def step_then_locations_before_excluded(context: Context, year: int): """Assert no locations before year.""" for loc in context.filtered_locations: assert ( - loc.inventoried_on.year >= year - ), f"Location from {loc.inventoried_on.year} should not be included" + loc.legacy_site_date.year >= year + ), f"Location from {loc.legacy_site_date.year} should not be included" -@then("locations inventoried after {year:Number} should not be included") +@then("locations with site date after {year:Number} should not be included") def step_then_locations_after_excluded(context: Context, year: int): """Assert no locations after year.""" for loc in context.filtered_locations: assert ( - loc.inventoried_on.year <= year - ), 
f"Location from {loc.inventoried_on.year} should not be included" + loc.legacy_site_date.year <= year + ), f"Location from {loc.legacy_site_date.year} should not be included" @then("the response should include exactly {count:Number} locations") @@ -721,44 +721,44 @@ def step_then_location_has_legacy(context: Context, expected_date: str): assert actual == expected_date, f"Expected {expected_date}, got {actual}" -@then('the current_location should include inventoried_on as "{expected_date}"') -def step_then_location_has_inventory(context: Context, expected_date: str): - """Assert location has inventoried_on.""" +@then('the current_location should include legacy_site_date as "{expected_date}"') +def step_then_location_has_site_date(context: Context, expected_date: str): + """Assert location has legacy_site_date.""" current_location = context.well_response.get("current_location", {}) - actual = current_location.get("inventoried_on") + actual = current_location.get("legacy_site_date") assert actual == expected_date, f"Expected {expected_date}, got {actual}" @then( - "the temporal sequence should be: well_completed_on → inventoried_on → legacy_date_created" + "the temporal sequence should be: well_completed_on → legacy_site_date → legacy_date_created" ) def step_then_temporal_sequence(context: Context): """Assert temporal order.""" well_completed = context.retrieved_well.well_completed_on - inventoried = context.retrieved_location.inventoried_on + site_date = context.retrieved_location.legacy_site_date legacy_created = context.retrieved_location.legacy_date_created assert ( - well_completed < inventoried - ), "Well should be completed before site inventoried" + well_completed < site_date + ), "Well should be completed before site date" assert ( - inventoried < legacy_created - ), "Site should be inventoried before DB record created" + site_date < legacy_created + ), "Site date should be before DB record created" @then("the timeline should show: {year1:Number} → 
{year2:Number} → {year3:Number}") def step_then_timeline_years(context: Context, year1: int, year2: int, year3: int): """Assert specific years in sequence.""" assert context.retrieved_well.well_completed_on.year == year1 - assert context.retrieved_location.inventoried_on.year == year2 + assert context.retrieved_location.legacy_site_date.year == year2 assert context.retrieved_location.legacy_date_created.year == year3 -@then("{percentage:Number}% should have non-null inventoried_on") -def step_then_percentage_inventory(context: Context, percentage: int): - """Assert percentage with inventoried_on.""" +@then("{percentage:Number}% should have non-null legacy_site_date") +def step_then_percentage_site_date(context: Context, percentage: int): + """Assert percentage with legacy_site_date.""" total = len(context.queried_locations) - populated = sum(1 for loc in context.queried_locations if loc.inventoried_on) + populated = sum(1 for loc in context.queried_locations if loc.legacy_site_date) actual_pct = (populated / total) * 100 tolerance = 2 @@ -805,10 +805,10 @@ def step_then_has_legacy_date(context: Context): assert context.retrieved_location.legacy_date_created is not None -@then("it should have inventoried_on (original AMPAPI SiteDate)") -def step_then_has_inventory_date(context: Context): - """Assert inventoried_on exists.""" - assert context.retrieved_location.inventoried_on is not None +@then("it should have legacy_site_date (original AMPAPI SiteDate)") +def step_then_has_site_date(context: Context): + """Assert legacy_site_date exists.""" + assert context.retrieved_location.legacy_site_date is not None @then("all three timestamps should be independently queryable") @@ -816,7 +816,7 @@ def step_then_all_queryable(context: Context): """Assert all fields are queryable.""" assert hasattr(context.retrieved_location, "created_at") assert hasattr(context.retrieved_location, "legacy_date_created") - assert hasattr(context.retrieved_location, "inventoried_on") + assert 
hasattr(context.retrieved_location, "legacy_site_date") @then("created_at should be a recent timestamp") @@ -843,10 +843,10 @@ def step_then_legacy_is(context: Context, expected_date: str): assert actual == expected, f"Expected {expected}, got {actual}" -@then('inventoried_on should be "{expected_date}"') -def step_then_inventory_is(context: Context, expected_date: str): - """Assert inventoried_on value.""" - actual = context.retrieved_location.inventoried_on +@then('legacy_site_date should be "{expected_date}"') +def step_then_site_date_is(context: Context, expected_date: str): + """Assert legacy_site_date value.""" + actual = context.retrieved_location.legacy_site_date expected = date.fromisoformat(expected_date) assert actual == expected, f"Expected {expected}, got {actual}" @@ -880,10 +880,10 @@ def step_then_no_validation_errors(context: Context): pass -@then("inventoried_on should be null") -def step_then_inventory_null(context: Context): - """Assert inventoried_on is null.""" - assert context.retrieved_location.inventoried_on is None +@then("legacy_site_date should be null") +def step_then_site_date_null(context: Context): + """Assert legacy_site_date is null.""" + assert context.retrieved_location.legacy_site_date is None @then("the well should still be valid") From a9293bb71260a303a609212f973e9e9bb3451995 Mon Sep 17 00:00:00 2001 From: kbighorse Date: Wed, 3 Dec 2025 06:55:26 +0000 Subject: [PATCH 38/66] Formatting changes --- .../features/steps/post_migration_legacy_data.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/tests/features/steps/post_migration_legacy_data.py b/tests/features/steps/post_migration_legacy_data.py index 162358308..b36dfa461 100644 --- a/tests/features/steps/post_migration_legacy_data.py +++ b/tests/features/steps/post_migration_legacy_data.py @@ -130,9 +130,7 @@ def step_given_multiple_locations(context: Context, count: int): legacy_date, site_date = test_data[i] location = 
create_test_location( legacy_date_created=date.fromisoformat(legacy_date), - legacy_site_date=( - date.fromisoformat(site_date) if site_date else None - ), + legacy_site_date=(date.fromisoformat(site_date) if site_date else None), ) context.test_locations.append(location) @@ -375,7 +373,9 @@ def step_when_filter_locations(context: Context, start_date: str, end_date: str) locations = ( session.query(Location) - .filter(Location.legacy_site_date >= start, Location.legacy_site_date <= end) + .filter( + Location.legacy_site_date >= start, Location.legacy_site_date <= end + ) .all() ) @@ -738,12 +738,8 @@ def step_then_temporal_sequence(context: Context): site_date = context.retrieved_location.legacy_site_date legacy_created = context.retrieved_location.legacy_date_created - assert ( - well_completed < site_date - ), "Well should be completed before site date" - assert ( - site_date < legacy_created - ), "Site date should be before DB record created" + assert well_completed < site_date, "Well should be completed before site date" + assert site_date < legacy_created, "Site date should be before DB record created" @then("the timeline should show: {year1:Number} → {year2:Number} → {year3:Number}") From dc7a31b93ddf564af0d0905a788108087cc32e93 Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 3 Dec 2025 00:30:21 -0800 Subject: [PATCH 39/66] Remove `well_completed_on` --- db/thing.py | 5 - schemas/thing.py | 6 - ...st-migration-legacy-data-retrieval.feature | 78 --- .../steps/post_migration_legacy_data.py | 445 ------------------ tests/test_thing.py | 144 ------ tests/test_transfer_legacy_dates.py | 125 ----- transfers/well_transfer.py | 15 - 7 files changed, 818 deletions(-) diff --git a/db/thing.py b/db/thing.py index b42b70d56..9f30d08e2 100644 --- a/db/thing.py +++ b/db/thing.py @@ -115,11 +115,6 @@ class Thing( ) well_construction_notes: Mapped[str] = mapped_column(Text, nullable=True) - well_completed_on: Mapped[date] = mapped_column( - Date, - nullable=True, 
- comment="Date when well construction/drilling was completed (from AMPAPI CompletionDate, active field for new wells)", - ) # Spring-related columns spring_type: Mapped[str] = lexicon_term( diff --git a/schemas/thing.py b/schemas/thing.py index 6de5908cc..692b78459 100644 --- a/schemas/thing.py +++ b/schemas/thing.py @@ -131,8 +131,6 @@ class CreateWell(CreateBaseThing, ValidateWell): ) measuring_point_description: str | None notes: list[CreateNote] | None = None - # Active field: users can set this for new wells - well_completed_on: date | None = None class CreateSpring(CreateBaseThing): @@ -227,8 +225,6 @@ class WellResponse(BaseThingResponse): measuring_point_height: float measuring_point_height_unit: str = "ft" measuring_point_description: str | None - # Active field: completion date for wells - well_completed_on: date | None = None water_notes: list[NoteResponse] | None = None measuring_notes: list[NoteResponse] | None = None @@ -334,8 +330,6 @@ class UpdateWell(UpdateThing, ValidateWell): well_casing_diameter: float | None = None # in inches well_casing_depth: float | None = None # in feet well_casing_materials: list[str] | None = None - # Active field: users can update completion date - well_completed_on: date | None = None class UpdateSpring(UpdateThing): diff --git a/tests/features/post-migration-legacy-data-retrieval.feature b/tests/features/post-migration-legacy-data-retrieval.feature index fa4663e1b..b5329ad9c 100644 --- a/tests/features/post-migration-legacy-data-retrieval.feature +++ b/tests/features/post-migration-legacy-data-retrieval.feature @@ -51,65 +51,6 @@ Feature: Post-Migration Legacy Data Retrieval Then the response should include exactly 3 locations And all should have legacy_date_created "2014-04-03" - # Well Completion Date Lookups - - Scenario: Retrieve well with completion date via API - Given a well exists with well_completed_on "2004-08-08" - When I retrieve that well via the API - Then the response should include well_completed_on 
as "2004-08-08" - And the well age should be calculable - - Scenario: Retrieve old well from early 1900s - Given a well exists with well_completed_on "1936-01-01" - When I retrieve that well via the API - Then the response should include well_completed_on as "1936-01-01" - And the well should be over 88 years old - - Scenario: List all wells includes completion date field - Given 10 wells exist with various completion dates - And 3 of those wells have null well_completed_on - When I GET /thing/water-well to list all wells - Then each well should have a well_completed_on field - And 70% of wells should have well_completed_on populated - - Scenario: Filter wells by completion date range - Given wells exist with completion dates from 1936 to 2024 - When I filter wells where well_completed_on is between "2000-01-01" and "2010-12-31" - Then the response should only include wells completed in that decade - And wells from 1936 should not be included - And wells from 2020 should not be included - - Scenario: Sort wells by completion date (oldest first) - Given wells exist with completion dates: 1936, 1965, 2004, 2020 - And some wells have null well_completed_on - When I GET /thing/water-well sorted by well_completed_on ascending - Then the first well should be from 1936 - And the last well with a date should be from 2020 - And wells without completion dates should appear last - - # Combined Queries - Location + Well Legacy Dates - - Scenario: Retrieve well with location showing all legacy dates - Given a well exists with well_completed_on "2004-08-08" - And that well's location has: - | field | value | - | legacy_date_created | 2014-04-03 | - | legacy_site_date | 2002-12-10 | - When I retrieve the well via the API - Then the well should have well_completed_on as "2004-08-08" - And the current_location should include legacy_date_created as "2014-04-03" - And the current_location should include legacy_site_date as "2002-12-10" - - Scenario: Timeline reconstruction - well 
completed before site inventoried - Given a well exists with well_completed_on "1995-06-15" - And that well's location has: - | field | value | - | legacy_site_date | 2003-12-10 | - | legacy_date_created | 2014-04-03 | - When I retrieve the well and its location - Then the temporal sequence should be: well_completed_on → legacy_site_date → legacy_date_created - And the timeline should show: 1995 → 2003 → 2014 - # Data Quality Validation Scenario: Verify migration preserved expected percentage of legacy dates @@ -119,12 +60,6 @@ Feature: Post-Migration Legacy Data Retrieval Then 9% should have non-null legacy_site_date And 100% should have non-null legacy_date_created - Scenario: Verify well completion date coverage matches expectation - Given 100 wells were migrated - And 30 of them had non-null CompletionDate in AMPAPI - When I query the migrated wells - Then 30% should have non-null well_completed_on - # Audit Trail Verification Scenario: Legacy dates preserved alongside audit timestamps @@ -149,13 +84,6 @@ Feature: Post-Migration Legacy Data Retrieval And legacy_site_date should be "2015-06-20" And the system should accept this without error - Scenario: Spring does not use well_completed_on field - Given a thing of type "spring" exists - When I retrieve that spring - Then well_completed_on should be null - And the field should exist in the response schema - And it should not cause validation errors - Scenario: Location with only legacy_date_created (no legacy_site_date) Given a location exists with: | field | value | @@ -164,9 +92,3 @@ Feature: Post-Migration Legacy Data Retrieval When I retrieve that location Then legacy_date_created should be "2014-10-17" And legacy_site_date should be null - - Scenario: Well without completion date - Given a well exists with well_completed_on null - When I retrieve that well - Then well_completed_on should be null - And the well should still be valid diff --git a/tests/features/steps/post_migration_legacy_data.py 
b/tests/features/steps/post_migration_legacy_data.py index b36dfa461..25e932159 100644 --- a/tests/features/steps/post_migration_legacy_data.py +++ b/tests/features/steps/post_migration_legacy_data.py @@ -47,42 +47,6 @@ def create_test_location(legacy_date_created=None, legacy_site_date=None): return location -def create_test_well(well_completed_on=None, thing_type="water well"): - """Helper to create a test well with completion date.""" - with session_ctx() as session: - # Create location - location = Location( - point="POINT(-106.607784 35.118924)", - elevation=1558.8, - release_status="public", - ) - session.add(location) - session.commit() - - # Create thing - thing = Thing( - name=f"Test-{thing_type}-{datetime.now().timestamp()}", - first_visit_date="2023-03-03", - thing_type=thing_type, - release_status="public", - well_depth=100.0 if thing_type == "water well" else None, - hole_depth=110.0 if thing_type == "water well" else None, - well_completed_on=well_completed_on, - ) - session.add(thing) - session.commit() - - # Associate - assoc = LocationThingAssociation(location=location, thing=thing) - assoc.effective_start = "2000-01-01T00:00:00Z" - session.add(assoc) - session.commit() - - session.refresh(thing) - session.refresh(location) - return thing, location - - @given("the AMPAPI data has been migrated to the database") def step_given_data_migrated(context: Context): """Assumption that migration has occurred.""" @@ -169,111 +133,6 @@ def step_given_locations_with_specific_date( context.test_locations.append(location) -@given('a well exists with well_completed_on "{completion_date}"') -def step_given_well_with_completion(context: Context, completion_date: str): - """Create well with completion date.""" - completed_on = ( - date.fromisoformat(completion_date) if completion_date != "null" else None - ) - - thing, location = create_test_well(well_completed_on=completed_on) - - context.test_well = thing - context.test_well_id = thing.id - 
context.test_well_location = location - - -@given("{count:Number} wells exist with various completion dates") -def step_given_multiple_wells(context: Context, count: int): - """Create multiple wells with various completion dates.""" - context.test_wells = [] - - completion_dates = [ - "1936-01-01", - "1965-06-15", - "2004-08-08", - "2020-05-15", - None, # No completion date - None, - None, - ] - - for i in range(min(count, len(completion_dates))): - completed_on = ( - date.fromisoformat(completion_dates[i]) if completion_dates[i] else None - ) - thing, location = create_test_well(well_completed_on=completed_on) - context.test_wells.append(thing) - - -@given("{null_count:Number} of those wells have null well_completed_on") -def step_given_wells_with_null_completion(context: Context, null_count: int): - """Verify expected number of nulls (declarative - already created).""" - # Wells were created in previous step with nulls - pass - - -@given( - "wells exist with completion dates from {start_year:Number} to {end_year:Number}" -) -def step_given_wells_date_range(context: Context, start_year: int, end_year: int): - """Create wells with completion dates across range.""" - context.test_wells = [] - - years = [1936, 1965, 2004, 2010, 2020, 2024] - for year in years: - thing, location = create_test_well(well_completed_on=date(year, 6, 15)) - context.test_wells.append(thing) - - -@given("wells exist with completion dates: {years}") -def step_given_wells_specific_years(context: Context, years: str): - """Create wells with specific completion years.""" - context.test_wells = [] - - year_list = [int(y.strip()) for y in years.split(",")] - - for year in year_list: - thing, location = create_test_well(well_completed_on=date(year, 6, 15)) - context.test_wells.append(thing) - - -@given("some wells have null well_completed_on") -def step_given_some_wells_null(context: Context): - """Add wells without completion dates.""" - if not hasattr(context, "test_wells"): - context.test_wells 
= [] - - for i in range(2): - thing, location = create_test_well(well_completed_on=None) - context.test_wells.append(thing) - - -@given("that well's location has") -def step_given_well_location_has_table(context: Context): - """Set legacy dates on the well's location.""" - data = {row["field"]: row["value"] for row in context.table} - - legacy_date_created = ( - date.fromisoformat(data.get("legacy_date_created")) - if data.get("legacy_date_created") - else None - ) - legacy_site_date = ( - date.fromisoformat(data.get("legacy_site_date")) - if data.get("legacy_site_date") - else None - ) - - with session_ctx() as session: - location = session.get(Location, context.test_well_location.id) - location.legacy_date_created = legacy_date_created - location.legacy_site_date = legacy_site_date - session.commit() - session.refresh(location) - context.test_well_location = location - - @given("{count:Number} locations were migrated") def step_given_count_locations_migrated(context: Context, count: int): """Create specified number of test locations.""" @@ -296,27 +155,6 @@ def step_given_sitedate_count(context: Context, count: int): pass -@given("{count:Number} wells were migrated") -def step_given_count_wells_migrated(context: Context, count: int): - """Create specified number of test wells.""" - context.test_wells = [] - - for i in range(count): - # 30% have completion dates - has_completion = i < count * 0.30 - - thing, location = create_test_well( - well_completed_on=date(2000 + (i % 24), 1, 1) if has_completion else None - ) - context.test_wells.append(thing) - - -@given("{count:Number} of them had non-null CompletionDate in AMPAPI") -def step_given_completion_count(context: Context, count: int): - """Declarative - data created in previous step.""" - pass - - @given("a location was migrated with legacy dates") def step_given_location_migrated_with_dates(context: Context): """Create location with both legacy dates.""" @@ -326,22 +164,6 @@ def 
step_given_location_migrated_with_dates(context: Context): context.test_location = location -@given('a thing of type "{thing_type}" exists') -def step_given_thing_of_type(context: Context, thing_type: str): - """Create a thing of specified type.""" - thing, location = create_test_well(well_completed_on=None, thing_type=thing_type) - context.test_thing = thing - context.test_thing_id = thing.id - - -@given("a well exists with well_completed_on null") -def step_given_well_null_completion(context: Context): - """Create well without completion date.""" - thing, location = create_test_well(well_completed_on=None) - context.test_well = thing - context.test_well_id = thing.id - - # WHEN steps @@ -393,69 +215,6 @@ def step_when_query_by_legacy_date(context: Context, target_date: str): context.queried_locations = locations -@when("I retrieve that well via the API") -def step_when_retrieve_well_api(context: Context): - """Retrieve well via GET API.""" - response = context.client.get(f"/thing/water-well/{context.test_well_id}") - assert response.status_code == 200 - context.well_response = response.json() - - -@when("I GET /thing/water-well to list all wells") -def step_when_get_all_wells(context: Context): - """Get all wells.""" - response = context.client.get("/thing/water-well") - assert response.status_code == 200 - context.wells_response = response.json() - - -@when( - 'I filter wells where well_completed_on is between "{start_date}" and "{end_date}"' -) -def step_when_filter_wells(context: Context, start_date: str, end_date: str): - """Filter wells by completion date range.""" - with session_ctx() as session: - start = date.fromisoformat(start_date) - end = date.fromisoformat(end_date) - - wells = ( - session.query(Thing) - .filter( - Thing.thing_type == "water well", - Thing.well_completed_on >= start, - Thing.well_completed_on <= end, - ) - .all() - ) - - context.filtered_wells = wells - - -@when("I GET /thing/water-well sorted by well_completed_on ascending") -def 
step_when_get_wells_sorted(context: Context): - """Get wells sorted by completion date.""" - with session_ctx() as session: - wells = ( - session.query(Thing) - .filter(Thing.thing_type == "water well") - .order_by(Thing.well_completed_on.asc().nullslast()) - .all() - ) - - context.sorted_wells = wells - - -@when("I retrieve the well and its location") -def step_when_retrieve_well_and_location(context: Context): - """Retrieve well with location.""" - with session_ctx() as session: - well = session.get(Thing, context.test_well.id) - location = session.get(Location, context.test_well_location.id) - - context.retrieved_well = well - context.retrieved_location = location - - @when("I query the migrated locations") def step_when_query_migrated_locations(context: Context): """Query all test locations.""" @@ -466,15 +225,6 @@ def step_when_query_migrated_locations(context: Context): context.queried_locations = locations -@when("I query the migrated wells") -def step_when_query_migrated_wells(context: Context): - """Query all test wells.""" - with session_ctx() as session: - well_ids = [well.id for well in context.test_wells] - wells = session.query(Thing).filter(Thing.id.in_(well_ids)).all() - context.queried_wells = wells - - @when("I retrieve that location") def step_when_retrieve_location(context: Context): """Retrieve location by ID.""" @@ -483,22 +233,6 @@ def step_when_retrieve_location(context: Context): context.retrieved_location = location -@when("I retrieve that spring") -def step_when_retrieve_spring(context: Context): - """Retrieve spring/thing by ID.""" - with session_ctx() as session: - thing = session.get(Thing, context.test_thing.id) - context.retrieved_thing = thing - - -@when("I retrieve that well") -def step_when_retrieve_well(context: Context): - """Retrieve well by ID.""" - with session_ctx() as session: - well = session.get(Thing, context.test_well.id) - context.retrieved_well = well - - # THEN steps @@ -606,150 +340,6 @@ def 
step_then_all_have_date(context: Context, expected_date: str): ), f"Location has {loc.legacy_date_created}, expected {expected}" -@then('the response should include well_completed_on as "{expected_date}"') -def step_then_well_completed_on(context: Context, expected_date: str): - """Assert well_completed_on matches.""" - actual = context.well_response.get("well_completed_on") - assert actual == expected_date, f"Expected {expected_date}, got {actual}" - - -@then("the well age should be calculable") -def step_then_age_calculable(context: Context): - """Assert age can be calculated.""" - completion_str = context.well_response.get("well_completed_on") - assert completion_str is not None, "Cannot calculate age without completion date" - - completed = date.fromisoformat(completion_str) - today = date.today() - age_years = (today - completed).days / 365.25 - assert age_years >= 0, "Age cannot be negative" - - -@then("the well should be over {min_age:Number} years old") -def step_then_well_over_age(context: Context, min_age: int): - """Assert well age exceeds minimum.""" - completion_str = context.well_response.get("well_completed_on") - completed = date.fromisoformat(completion_str) - today = date.today() - age_years = (today - completed).days / 365.25 - - assert age_years >= min_age, f"Expected over {min_age} years, got {age_years:.1f}" - - -@then("each well should have a well_completed_on field") -def step_then_all_wells_have_field(context: Context): - """Assert all wells have the field.""" - items = context.wells_response.get("items", []) - for item in items: - assert "well_completed_on" in item, f"Well missing well_completed_on" - - -@then("{percentage:Number}% of wells should have well_completed_on populated") -def step_then_percentage_populated(context: Context, percentage: int): - """Assert approximate percentage.""" - items = context.wells_response.get("items", []) - total = len(items) - if total == 0: - return - - populated = sum(1 for item in items if 
item.get("well_completed_on") is not None) - actual_pct = (populated / total) * 100 - - tolerance = 10 - assert ( - abs(actual_pct - percentage) < tolerance - ), f"Expected ~{percentage}%, got {actual_pct:.1f}%" - - -@then("the response should only include wells completed in that decade") -def step_then_wells_in_decade(context: Context): - """Assert filtered wells in range.""" - for well in context.filtered_wells: - assert 2000 <= well.well_completed_on.year <= 2010 - - -@then("wells from {year:Number} should not be included") -def step_then_wells_year_excluded(context: Context, year: int): - """Assert wells from year excluded.""" - for well in context.filtered_wells: - assert well.well_completed_on.year != year - - -@then("the first well should be from {year:Number}") -def step_then_first_well_year(context: Context, year: int): - """Assert first well year.""" - if context.sorted_wells and context.sorted_wells[0].well_completed_on: - actual_year = context.sorted_wells[0].well_completed_on.year - assert actual_year == year, f"Expected {year}, got {actual_year}" - - -@then("the last well with a date should be from {year:Number}") -def step_then_last_well_year(context: Context, year: int): - """Assert last non-null well year.""" - non_null = [w for w in context.sorted_wells if w.well_completed_on] - if non_null: - actual_year = non_null[-1].well_completed_on.year - assert actual_year == year, f"Expected {year}, got {actual_year}" - - -@then("wells without completion dates should appear last") -def step_then_nulls_last(context: Context): - """Assert nulls at end.""" - first_null_idx = next( - (i for i, w in enumerate(context.sorted_wells) if w.well_completed_on is None), - len(context.sorted_wells), - ) - - for well in context.sorted_wells[first_null_idx:]: - assert ( - well.well_completed_on is None - ), "Found non-null after null in sorted list" - - -@then('the well should have well_completed_on as "{expected_date}"') -def step_then_well_has_completion(context: 
Context, expected_date: str): - """Assert well has completion date.""" - actual = context.well_response.get("well_completed_on") - assert actual == expected_date, f"Expected {expected_date}, got {actual}" - - -@then('the current_location should include legacy_date_created as "{expected_date}"') -def step_then_location_has_legacy(context: Context, expected_date: str): - """Assert location has legacy_date_created.""" - current_location = context.well_response.get("current_location", {}) - actual = current_location.get("legacy_date_created") - assert actual == expected_date, f"Expected {expected_date}, got {actual}" - - -@then('the current_location should include legacy_site_date as "{expected_date}"') -def step_then_location_has_site_date(context: Context, expected_date: str): - """Assert location has legacy_site_date.""" - current_location = context.well_response.get("current_location", {}) - actual = current_location.get("legacy_site_date") - assert actual == expected_date, f"Expected {expected_date}, got {actual}" - - -@then( - "the temporal sequence should be: well_completed_on → legacy_site_date → legacy_date_created" -) -def step_then_temporal_sequence(context: Context): - """Assert temporal order.""" - well_completed = context.retrieved_well.well_completed_on - site_date = context.retrieved_location.legacy_site_date - legacy_created = context.retrieved_location.legacy_date_created - - assert well_completed < site_date, "Well should be completed before site date" - assert site_date < legacy_created, "Site date should be before DB record created" - - -@then("the timeline should show: {year1:Number} → {year2:Number} → {year3:Number}") -def step_then_timeline_years(context: Context, year1: int, year2: int, year3: int): - """Assert specific years in sequence.""" - assert context.retrieved_well.well_completed_on.year == year1 - assert context.retrieved_location.legacy_site_date.year == year2 - assert context.retrieved_location.legacy_date_created.year == year3 - - 
@then("{percentage:Number}% should have non-null legacy_site_date") def step_then_percentage_site_date(context: Context, percentage: int): """Assert percentage with legacy_site_date.""" @@ -776,19 +366,6 @@ def step_then_percentage_legacy(context: Context, percentage: int): ), f"Expected ~{percentage}%, got {actual_pct:.1f}%" -@then("{percentage:Number}% should have non-null well_completed_on") -def step_then_percentage_completion(context: Context, percentage: int): - """Assert percentage with well_completed_on.""" - total = len(context.queried_wells) - populated = sum(1 for well in context.queried_wells if well.well_completed_on) - actual_pct = (populated / total) * 100 - - tolerance = 2 - assert ( - abs(actual_pct - percentage) < tolerance - ), f"Expected ~{percentage}%, got {actual_pct:.1f}%" - - @then("it should have created_at (new system timestamp from migration)") def step_then_has_created_at(context: Context): """Assert created_at exists.""" @@ -854,28 +431,6 @@ def step_then_no_error(context: Context): pass -@then("well_completed_on should be null") -def step_then_completion_null(context: Context): - """Assert well_completed_on is null.""" - if hasattr(context, "retrieved_thing"): - assert context.retrieved_thing.well_completed_on is None - elif hasattr(context, "retrieved_well"): - assert context.retrieved_well.well_completed_on is None - - -@then("the field should exist in the response schema") -def step_then_field_exists_in_schema(context: Context): - """Assert field exists in schema.""" - if hasattr(context, "retrieved_thing"): - assert hasattr(context.retrieved_thing, "well_completed_on") - - -@then("it should not cause validation errors") -def step_then_no_validation_errors(context: Context): - """Assert no validation errors.""" - pass - - @then("legacy_site_date should be null") def step_then_site_date_null(context: Context): """Assert legacy_site_date is null.""" diff --git a/tests/test_thing.py b/tests/test_thing.py index 94d00aa85..3792b4302 
100644 --- a/tests/test_thing.py +++ b/tests/test_thing.py @@ -1132,148 +1132,4 @@ def test_delete_thing_id_link_404_not_found(second_thing_id_link): assert data["detail"] == f"ThingIdLink with ID {bad_id} not found." -# ============= Well completion date tests ==================================== - - -def test_create_well_with_completion_date(location): - """Test creating a well with well_completed_on (active field - users can set this)""" - payload = { - "name": "Test Well", - "location_id": location.id, - "measuring_point_height": 2.5, - "measuring_point_description": "top of casing", - "release_status": "draft", - "well_completed_on": "2004-08-08", - } - response = client.post("/thing/water-well", json=payload) - - assert response.status_code == 201 - data = response.json() - assert "id" in data - assert data["well_completed_on"] == "2004-08-08" - - # cleanup after test - from db import Thing - from tests import cleanup_post_test - - cleanup_post_test(Thing, data["id"]) - - -def test_create_well_with_old_completion_date(location): - """Test creating a well with very old completion date (e.g., for documenting historical wells)""" - payload = { - "name": "Historical Well", - "location_id": location.id, - "measuring_point_height": 2.5, - "measuring_point_description": "top of casing", - "release_status": "draft", - "well_completed_on": "1936-01-01", - } - response = client.post("/thing/water-well", json=payload) - - assert response.status_code == 201 - data = response.json() - assert data["well_completed_on"] == "1936-01-01" - - # cleanup after test - from db import Thing - from tests import cleanup_post_test - - cleanup_post_test(Thing, data["id"]) - - -def test_create_well_without_completion_date(location): - """Test that well_completed_on is optional (nullable) when creating a well""" - payload = { - "name": "Test Well Without Date", - "location_id": location.id, - "measuring_point_height": 2.5, - "measuring_point_description": "top of casing", - 
"release_status": "draft", - } - response = client.post("/thing/water-well", json=payload) - - assert response.status_code == 201 - data = response.json() - # Field should be present but null - assert "well_completed_on" in data - assert data["well_completed_on"] is None - - # cleanup after test - from db import Thing - from tests import cleanup_post_test - - cleanup_post_test(Thing, data["id"]) - - -def test_spring_well_completed_on_is_null(location): - """Test that springs do NOT have well_completed_on field (it's well-specific)""" - payload = { - "name": "Test Spring", - "location_id": location.id, - "spring_type": "Artesian", - "release_status": "draft", - } - response = client.post("/thing/spring", json=payload) - - assert response.status_code == 201 - data = response.json() - # Springs should NOT have well_completed_on field (only wells have completion dates) - assert "well_completed_on" not in data - assert data["thing_type"] == "spring" - - # cleanup after test - from db import Thing - from tests import cleanup_post_test - - cleanup_post_test(Thing, data["id"]) - - -def test_well_with_completion_date_and_location_legacy_fields(location): - """Test combined scenario: new well with completion date + location legacy fields (null for new locations)""" - # Create a new location (without legacy fields - they're migration-only) - from tests import cleanup_post_test - - location_payload = { - "point": "POINT (-106.607784 35.118924)", - "elevation": 1558.8, - "release_status": "draft", - } - location_response = client.post("/location", json=location_payload) - assert location_response.status_code == 201 - location_id = location_response.json()["id"] - - # Create well with completion date at that location - well_payload = { - "name": "Test Well", - "location_id": location_id, - "measuring_point_height": 2.5, - "measuring_point_description": "top of casing", - "release_status": "draft", - "well_completed_on": "2020-06-15", # User can set this for new wells - } - 
well_response = client.post("/thing/water-well", json=well_payload) - assert well_response.status_code == 201 - well_id = well_response.json()["id"] - - # Retrieve the well - get_response = client.get(f"/thing/water-well/{well_id}") - assert get_response.status_code == 200 - data = get_response.json() - - # well_completed_on is set (active field) - assert data["well_completed_on"] == "2020-06-15" - - # Location legacy fields are null (migration-only fields) - # current_location is a GeoJSON Feature, so fields are under properties - assert data["current_location"]["properties"]["legacy_date_created"] is None - assert data["current_location"]["properties"]["legacy_site_date"] is None - - # cleanup after test - from db import Thing, Location - - cleanup_post_test(Thing, well_id) - cleanup_post_test(Location, location_id) - - # ============= EOF ============================================= diff --git a/tests/test_transfer_legacy_dates.py b/tests/test_transfer_legacy_dates.py index 05dbe8dfe..795820ec8 100644 --- a/tests/test_transfer_legacy_dates.py +++ b/tests/test_transfer_legacy_dates.py @@ -19,7 +19,6 @@ These tests verify that: 1. Location.legacy_date_created is populated from CSV DateCreated 2. Location.legacy_site_date is populated from CSV SiteDate (if not null) -3. 
Thing.well_completed_on is populated from CSV CompletionDate (if not null) """ import datetime from unittest.mock import Mock, patch, MagicMock @@ -27,7 +26,6 @@ import pytest from transfers.util import make_location -from schemas.thing import CreateWell # ============================================================================ @@ -257,91 +255,6 @@ def test_make_location_legacy_dates_independent_of_created_at(mock_lexicon_mappe assert location.legacy_site_date is not None -# ============================================================================ -# WELL COMPLETION DATE TESTS -# ============================================================================ - - -def test_create_well_schema_accepts_well_completed_on(): - """Test that CreateWell schema accepts well_completed_on from CSV CompletionDate""" - # Simulate data from CSV transfer - well_data = { - "location_id": 1, - "name": "TEST-WELL-001", - "well_completed_on": datetime.date(2004, 8, 8), # From CSV CompletionDate - "hole_depth": 100.0, - "well_depth": 95.0, - "measuring_point_height": 2.5, - "measuring_point_description": "top of casing", - "release_status": "public", - } - - # Validate using CreateWell schema - schema = CreateWell(**well_data) - - assert schema.well_completed_on == datetime.date(2004, 8, 8) - - -def test_create_well_schema_well_completed_on_optional(): - """Test that well_completed_on is optional (70% of wells don't have CompletionDate)""" - well_data = { - "location_id": 1, - "name": "TEST-WELL-002", - "hole_depth": 100.0, - "well_depth": 95.0, - "measuring_point_height": 2.5, - "measuring_point_description": "top of casing", - "release_status": "public", - # No well_completed_on provided - } - - # Should not raise validation error - schema = CreateWell(**well_data) - - # Field should be optional - assert hasattr(schema, "well_completed_on") - # Value should be None when not provided - assert schema.well_completed_on is None - - -def 
test_create_well_with_very_old_completion_date(): - """Test that very old completion dates (1936) are accepted""" - well_data = { - "location_id": 1, - "name": "HISTORICAL-WELL", - "well_completed_on": datetime.date(1936, 1, 1), # Oldest well in dataset - "hole_depth": 100.0, - "well_depth": 95.0, - "measuring_point_height": 2.5, - "measuring_point_description": "top of casing", - "release_status": "public", - } - - schema = CreateWell(**well_data) - - assert schema.well_completed_on == datetime.date(1936, 1, 1) - - -def test_create_well_completed_on_is_date_not_datetime(): - """Test that well_completed_on is Date type (not DateTime)""" - well_data = { - "location_id": 1, - "name": "TEST-WELL-003", - "well_completed_on": datetime.date(2004, 8, 8), # Date, not DateTime - "hole_depth": 100.0, - "well_depth": 95.0, - "measuring_point_height": 2.5, - "measuring_point_description": "top of casing", - "release_status": "public", - } - - schema = CreateWell(**well_data) - - # Should accept date type - assert isinstance(schema.well_completed_on, datetime.date) - assert not isinstance(schema.well_completed_on, datetime.datetime) - - # ============================================================================ # DATA COVERAGE TESTS (Simulating Migration Statistics) # ============================================================================ @@ -410,44 +323,6 @@ def test_location_legacy_date_coverage_statistics(mock_lexicon_mapper): assert locations_with_site_date == 9 # 9% should have legacy_site_date -def test_well_completion_date_coverage_statistics(): - """Test that expected percentage of wells have completion dates""" - # Simulate 100 wells from CSV - wells_with_completion_date = 0 - - for i in range(100): - if i < 30: # 30% have CompletionDate - well_data = { - "location_id": 1, - "name": f"WELL-{i:03d}", - "well_completed_on": datetime.date(2004, 8, 8), - "hole_depth": 100.0, - "well_depth": 95.0, - "measuring_point_height": 2.5, - "measuring_point_description": 
"top of casing", - "release_status": "public", - } - else: # 70% don't have CompletionDate - well_data = { - "location_id": 1, - "name": f"WELL-{i:03d}", - "hole_depth": 100.0, - "well_depth": 95.0, - "measuring_point_height": 2.5, - "measuring_point_description": "top of casing", - "release_status": "public", - # No well_completed_on - } - - schema = CreateWell(**well_data) - - if schema.well_completed_on is not None: - wells_with_completion_date += 1 - - # Verify expected coverage - assert wells_with_completion_date == 30 # 30% should have completion dates - - # ============================================================================ # EOF # ============================================================================ diff --git a/transfers/well_transfer.py b/transfers/well_transfer.py index 5daa1d8ee..ee54d0216 100644 --- a/transfers/well_transfer.py +++ b/transfers/well_transfer.py @@ -237,19 +237,6 @@ def transfer_wells(session: Session, flags: dict = None, limit: int = 0) -> None [] if isna(row.CasingDescription) else _extract_casing_materials(row) ) - # Extract well_completed_on from CompletionDate (Date type, not DateTime) - well_completed_on = None - if not isna(row.CompletionDate): - try: - well_completed_on = datetime.strptime( - row.CompletionDate, "%Y-%m-%d %H:%M:%S.%f" - ).date() - except (ValueError, AttributeError): - # If parsing fails, leave as None - logger.warning( - f"Could not parse CompletionDate for {row.PointID}: {row.CompletionDate}" - ) - # manually add the well rather than add_well from services/thing_helper.py # so that effective_start can be set on the location assocation @@ -267,7 +254,6 @@ def transfer_wells(session: Session, flags: dict = None, limit: int = 0) -> None release_status="public" if row.PublicRelease else "private", measuring_point_height=row.MPHeight, measuring_point_description=row.MeasuringPoint, - well_completed_on=well_completed_on, notes=( [{"content": row.Notes, "note_type": "Other"}] if row.Notes else [] ), @@ 
-297,7 +283,6 @@ def transfer_wells(session: Session, flags: dict = None, limit: int = 0) -> None well_data["nma_pk_welldata"] = row.WellID well_data.pop("notes") - # well_completed_on is kept in well_data (not excluded above) well = Thing(**well_data) session.add(well) # logger.info(f"Created well for {row.PointID}") From f0112264cb71eafeb36fca6363b3d9fc21e69ee5 Mon Sep 17 00:00:00 2001 From: kbighorse Date: Wed, 3 Dec 2025 08:55:09 +0000 Subject: [PATCH 40/66] Formatting changes --- schemas/location.py | 4 +--- tests/features/steps/post_migration_legacy_data.py | 4 +--- transfers/util.py | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/schemas/location.py b/schemas/location.py index f34c54115..ca182ebd5 100644 --- a/schemas/location.py +++ b/schemas/location.py @@ -155,9 +155,7 @@ def populate_fields(cls, data: Any) -> Any: data_dict["properties"]["elevation"] = convert_m_to_ft(elevation_m) data_dict["properties"]["elevation_method"] = data_dict.get("elevation_method") # populate AMPAPI date fields - data_dict["properties"]["nma_date_created"] = data_dict.get( - "nma_date_created" - ) + data_dict["properties"]["nma_date_created"] = data_dict.get("nma_date_created") data_dict["properties"]["nma_site_date"] = data_dict.get("nma_site_date") # populate UTM coordinates diff --git a/tests/features/steps/post_migration_legacy_data.py b/tests/features/steps/post_migration_legacy_data.py index 99ddd028e..7c2c36ffe 100644 --- a/tests/features/steps/post_migration_legacy_data.py +++ b/tests/features/steps/post_migration_legacy_data.py @@ -195,9 +195,7 @@ def step_when_filter_locations(context: Context, start_date: str, end_date: str) locations = ( session.query(Location) - .filter( - Location.nma_site_date >= start, Location.nma_site_date <= end - ) + .filter(Location.nma_site_date >= start, Location.nma_site_date <= end) .all() ) diff --git a/transfers/util.py b/transfers/util.py index c8d054a0a..5216c204f 100644 --- a/transfers/util.py +++ 
b/transfers/util.py @@ -253,9 +253,7 @@ def make_location(row: pd.Series, elevations: dict) -> tuple: nma_site_date = None if row.SiteDate: - nma_site_date = datetime.strptime( - row.SiteDate, "%Y-%m-%d %H:%M:%S.%f" - ).date() + nma_site_date = datetime.strptime(row.SiteDate, "%Y-%m-%d %H:%M:%S.%f").date() location = Location( nma_pk_location=row.LocationId, From f021c4be309fa69dd94fd2762e249cbc97b2e64d Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 3 Dec 2025 00:55:20 -0800 Subject: [PATCH 41/66] Replace `legacy_` prefix with `nma_` --- db/location.py | 6 +- schemas/location.py | 26 +-- ...st-migration-legacy-data-retrieval.feature | 70 ++++---- .../steps/post_migration_legacy_data.py | 164 +++++++++--------- tests/test_location.py | 24 +-- tests/test_transfer_legacy_dates.py | 64 +++---- transfers/util.py | 14 +- 7 files changed, 184 insertions(+), 184 deletions(-) diff --git a/db/location.py b/db/location.py index a07958346..c00c11a70 100644 --- a/db/location.py +++ b/db/location.py @@ -62,13 +62,13 @@ class Location(Base, AutoBaseMixin, ReleaseMixin, NotesMixin, DataProvenanceMixi nma_notes_location: Mapped[str] = mapped_column(Text, nullable=True) nma_coordinate_notes: Mapped[str] = mapped_column(Text, nullable=True) - # --- Legacy AMPAPI Date Fields (Migration-Only, Read-Only Post-Migration) --- - legacy_date_created: Mapped[datetime.date] = mapped_column( + # --- AMPAPI Date Fields (Migration-Only, Read-Only Post-Migration) --- + nma_date_created: Mapped[datetime.date] = mapped_column( Date, nullable=True, comment="Original AMPAPI DateCreated (migration-only field)", ) - legacy_site_date: Mapped[datetime.date] = mapped_column( + nma_site_date: Mapped[datetime.date] = mapped_column( Date, nullable=True, comment="Original AMPAPI SiteDate (migration-only field)" ) diff --git a/schemas/location.py b/schemas/location.py index 1f4bad472..f34c54115 100644 --- a/schemas/location.py +++ b/schemas/location.py @@ -107,9 +107,9 @@ class 
GeoJSONProperties(BaseModel): default_factory=GeoJSONUTMCoordinates ) notes: list[NoteResponse] = [] - # Legacy AMPAPI date fields (migration-only, read-only) - legacy_date_created: date | None = None - legacy_site_date: date | None = None + # AMPAPI date fields (migration-only, read-only) + nma_date_created: date | None = None + nma_site_date: date | None = None model_config = ConfigDict( from_attributes=True, @@ -154,11 +154,11 @@ def populate_fields(cls, data: Any) -> Any: data_dict["properties"]["notes"] = data_dict.get("notes") data_dict["properties"]["elevation"] = convert_m_to_ft(elevation_m) data_dict["properties"]["elevation_method"] = data_dict.get("elevation_method") - # populate legacy date fields - data_dict["properties"]["legacy_date_created"] = data_dict.get( - "legacy_date_created" + # populate AMPAPI date fields + data_dict["properties"]["nma_date_created"] = data_dict.get( + "nma_date_created" ) - data_dict["properties"]["legacy_site_date"] = data_dict.get("legacy_site_date") + data_dict["properties"]["nma_site_date"] = data_dict.get("nma_site_date") # populate UTM coordinates point_utm_zone_13n_wkt = transform_srid( @@ -190,9 +190,9 @@ class LocationResponse(BaseResponseModel): county: str | None quad_name: str | None - # Legacy AMPAPI date fields (migration-only, read-only post-migration) - legacy_date_created: date | None = None - legacy_site_date: date | None = None + # AMPAPI date fields (migration-only, read-only post-migration) + nma_date_created: date | None = None + nma_site_date: date | None = None @field_validator("point", mode="before") def point_to_wkt(cls, value): @@ -232,9 +232,9 @@ class UpdateLocation(BaseUpdateModel, ValidateLocation): coordinate_accuracy: float | None = None coordinate_method: CoordinateMethod | None = None - # Legacy AMPAPI date fields (migration-only, can be updated but not created) - legacy_date_created: date | None = None - legacy_site_date: date | None = None + # AMPAPI date fields (migration-only, can be 
updated but not created) + nma_date_created: date | None = None + nma_site_date: date | None = None # ============= EOF ============================================= diff --git a/tests/features/post-migration-legacy-data-retrieval.feature b/tests/features/post-migration-legacy-data-retrieval.feature index b5329ad9c..61f3e8c40 100644 --- a/tests/features/post-migration-legacy-data-retrieval.feature +++ b/tests/features/post-migration-legacy-data-retrieval.feature @@ -13,43 +13,43 @@ Feature: Post-Migration Legacy Data Retrieval Scenario: Retrieve location with both legacy dates via API Given a location exists with: | field | value | - | legacy_date_created | 2014-04-03 | - | legacy_site_date | 2002-12-10 | + | nma_date_created | 2014-04-03 | + | nma_site_date | 2002-12-10 | When I retrieve that location via the API - Then the response should include legacy_date_created as "2014-04-03" - And the response should include legacy_site_date as "2002-12-10" + Then the response should include nma_date_created as "2014-04-03" + And the response should include nma_site_date as "2002-12-10" And the time gap should be approximately 11.3 years Scenario: Retrieve location with large time gap (54 years) Given a location exists with: | field | value | - | legacy_date_created | 2008-05-28 | - | legacy_site_date | 1954-05-01 | + | nma_date_created | 2008-05-28 | + | nma_site_date | 1954-05-01 | When I retrieve that location via the API - Then the response should include legacy_date_created as "2008-05-28" - And the response should include legacy_site_date as "1954-05-01" + Then the response should include nma_date_created as "2008-05-28" + And the response should include nma_site_date as "1954-05-01" And the time gap should be approximately 54 years Scenario: List all locations includes legacy date fields Given 5 locations exist with various legacy dates When I GET /location to list all locations - Then each location should have a legacy_date_created field - And each location should 
have a legacy_site_date field - And some locations should have null legacy_site_date + Then each location should have a nma_date_created field + And each location should have a nma_site_date field + And some locations should have null nma_site_date Scenario: Filter locations by legacy site date range - Given locations exist with legacy_site_date ranging from 1950 to 2024 - When I filter locations where legacy_site_date is between "2000-01-01" and "2010-12-31" - Then the response should only include locations with legacy_site_date in that decade - And locations with legacy_site_date before 2000 should not be included - And locations with legacy_site_date after 2010 should not be included + Given locations exist with nma_site_date ranging from 1950 to 2024 + When I filter locations where nma_site_date is between "2000-01-01" and "2010-12-31" + Then the response should only include locations with nma_site_date in that decade + And locations with nma_site_date before 2000 should not be included + And locations with nma_site_date after 2010 should not be included - Scenario: Query location by legacy_date_created - Given 3 locations exist with legacy_date_created "2014-04-03" - And 2 locations exist with legacy_date_created "2017-12-06" - When I query for locations with legacy_date_created "2014-04-03" + Scenario: Query location by nma_date_created + Given 3 locations exist with nma_date_created "2014-04-03" + And 2 locations exist with nma_date_created "2017-12-06" + When I query for locations with nma_date_created "2014-04-03" Then the response should include exactly 3 locations - And all should have legacy_date_created "2014-04-03" + And all should have nma_date_created "2014-04-03" # Data Quality Validation @@ -57,8 +57,8 @@ Feature: Post-Migration Legacy Data Retrieval Given 100 locations were migrated And 9 of them had non-null SiteDate in AMPAPI When I query the migrated locations - Then 9% should have non-null legacy_site_date - And 100% should have non-null 
legacy_date_created + Then 9% should have non-null nma_site_date + And 100% should have non-null nma_date_created # Audit Trail Verification @@ -66,29 +66,29 @@ Feature: Post-Migration Legacy Data Retrieval Given a location was migrated with legacy dates When I retrieve that location Then it should have created_at (new system timestamp from migration) - And it should have legacy_date_created (original AMPAPI DateCreated) - And it should have legacy_site_date (original AMPAPI SiteDate) + And it should have nma_date_created (original AMPAPI DateCreated) + And it should have nma_site_date (original AMPAPI SiteDate) And all three timestamps should be independently queryable And created_at should be a recent timestamp - And legacy_date_created should be an older date + And nma_date_created should be an older date # Edge Cases Scenario: Location where SiteDate is later than DateCreated (data anomaly) Given a location exists with: | field | value | - | legacy_date_created | 2010-01-15 | - | legacy_site_date | 2015-06-20 | + | nma_date_created | 2010-01-15 | + | nma_site_date | 2015-06-20 | When I retrieve that location - Then legacy_date_created should be "2010-01-15" - And legacy_site_date should be "2015-06-20" + Then nma_date_created should be "2010-01-15" + And nma_site_date should be "2015-06-20" And the system should accept this without error - Scenario: Location with only legacy_date_created (no legacy_site_date) + Scenario: Location with only nma_date_created (no nma_site_date) Given a location exists with: | field | value | - | legacy_date_created | 2014-10-17 | - | legacy_site_date | null | + | nma_date_created | 2014-10-17 | + | nma_site_date | null | When I retrieve that location - Then legacy_date_created should be "2014-10-17" - And legacy_site_date should be null + Then nma_date_created should be "2014-10-17" + And nma_site_date should be null diff --git a/tests/features/steps/post_migration_legacy_data.py 
b/tests/features/steps/post_migration_legacy_data.py index 25e932159..99ddd028e 100644 --- a/tests/features/steps/post_migration_legacy_data.py +++ b/tests/features/steps/post_migration_legacy_data.py @@ -31,15 +31,15 @@ def parse_number(text): register_type(Number=parse_number) -def create_test_location(legacy_date_created=None, legacy_site_date=None): +def create_test_location(nma_date_created=None, nma_site_date=None): """Helper to create a test location with legacy dates.""" with session_ctx() as session: location = Location( point="POINT(-106.607784 35.118924)", elevation=1558.8, release_status="public", - legacy_date_created=legacy_date_created, - legacy_site_date=legacy_site_date, + nma_date_created=nma_date_created, + nma_site_date=nma_site_date, ) session.add(location) session.commit() @@ -58,19 +58,19 @@ def step_given_location_with_table(context: Context): """Create location with fields from table.""" data = {row["field"]: row["value"] for row in context.table} - legacy_date_created = ( - date.fromisoformat(data["legacy_date_created"]) - if data.get("legacy_date_created") and data["legacy_date_created"] != "null" + nma_date_created = ( + date.fromisoformat(data["nma_date_created"]) + if data.get("nma_date_created") and data["nma_date_created"] != "null" else None ) - legacy_site_date = ( - date.fromisoformat(data["legacy_site_date"]) - if data.get("legacy_site_date") and data["legacy_site_date"] != "null" + nma_site_date = ( + date.fromisoformat(data["nma_site_date"]) + if data.get("nma_site_date") and data["nma_site_date"] != "null" else None ) location = create_test_location( - legacy_date_created=legacy_date_created, legacy_site_date=legacy_site_date + nma_date_created=nma_date_created, nma_site_date=nma_site_date ) context.test_location = location @@ -93,33 +93,33 @@ def step_given_multiple_locations(context: Context, count: int): for i in range(min(count, len(test_data))): legacy_date, site_date = test_data[i] location = create_test_location( - 
legacy_date_created=date.fromisoformat(legacy_date), - legacy_site_date=(date.fromisoformat(site_date) if site_date else None), + nma_date_created=date.fromisoformat(legacy_date), + nma_site_date=(date.fromisoformat(site_date) if site_date else None), ) context.test_locations.append(location) @given( - "locations exist with legacy_site_date ranging from {start_year:Number} to {end_year:Number}" + "locations exist with nma_site_date ranging from {start_year:Number} to {end_year:Number}" ) def step_given_locations_date_range(context: Context, start_year: int, end_year: int): - """Create locations with legacy_site_date across a date range.""" + """Create locations with nma_site_date across a date range.""" context.test_locations = [] years = [1954, 2002, 2003, 2010, 2015, 2020, 2024] for year in years: location = create_test_location( - legacy_date_created=date(year + 5, 1, 1), # Always 5 years after site date - legacy_site_date=date(year, 6, 15), + nma_date_created=date(year + 5, 1, 1), # Always 5 years after site date + nma_site_date=date(year, 6, 15), ) context.test_locations.append(location) -@given('{count:Number} locations exist with legacy_date_created "{target_date}"') +@given('{count:Number} locations exist with nma_date_created "{target_date}"') def step_given_locations_with_specific_date( context: Context, count: int, target_date: str ): - """Create locations with specific legacy_date_created.""" + """Create locations with specific nma_date_created.""" if not hasattr(context, "test_locations"): context.test_locations = [] @@ -127,8 +127,8 @@ def step_given_locations_with_specific_date( for i in range(count): location = create_test_location( - legacy_date_created=target, - legacy_site_date=date(2000 + i, 1, 1), # Vary the site dates + nma_date_created=target, + nma_site_date=date(2000 + i, 1, 1), # Vary the site dates ) context.test_locations.append(location) @@ -139,12 +139,12 @@ def step_given_count_locations_migrated(context: Context, count: int): 
context.test_locations = [] for i in range(count): - # 9% have legacy_site_date + # 9% have nma_site_date has_site_date = i < count * 0.09 location = create_test_location( - legacy_date_created=date(2014, 1, i % 28 + 1), - legacy_site_date=date(2003, 1, i % 28 + 1) if has_site_date else None, + nma_date_created=date(2014, 1, i % 28 + 1), + nma_site_date=date(2003, 1, i % 28 + 1) if has_site_date else None, ) context.test_locations.append(location) @@ -159,7 +159,7 @@ def step_given_sitedate_count(context: Context, count: int): def step_given_location_migrated_with_dates(context: Context): """Create location with both legacy dates.""" location = create_test_location( - legacy_date_created=date(2014, 4, 3), legacy_site_date=date(2002, 12, 10) + nma_date_created=date(2014, 4, 3), nma_site_date=date(2002, 12, 10) ) context.test_location = location @@ -184,7 +184,7 @@ def step_when_get_all_locations(context: Context): @when( - 'I filter locations where legacy_site_date is between "{start_date}" and "{end_date}"' + 'I filter locations where nma_site_date is between "{start_date}" and "{end_date}"' ) def step_when_filter_locations(context: Context, start_date: str, end_date: str): """Filter locations by date range.""" @@ -196,7 +196,7 @@ def step_when_filter_locations(context: Context, start_date: str, end_date: str) locations = ( session.query(Location) .filter( - Location.legacy_site_date >= start, Location.legacy_site_date <= end + Location.nma_site_date >= start, Location.nma_site_date <= end ) .all() ) @@ -204,13 +204,13 @@ def step_when_filter_locations(context: Context, start_date: str, end_date: str) context.filtered_locations = locations -@when('I query for locations with legacy_date_created "{target_date}"') +@when('I query for locations with nma_date_created "{target_date}"') def step_when_query_by_legacy_date(context: Context, target_date: str): - """Query locations by legacy_date_created.""" + """Query locations by nma_date_created.""" with session_ctx() as 
session: target = date.fromisoformat(target_date) locations = ( - session.query(Location).filter(Location.legacy_date_created == target).all() + session.query(Location).filter(Location.nma_date_created == target).all() ) context.queried_locations = locations @@ -236,25 +236,25 @@ def step_when_retrieve_location(context: Context): # THEN steps -@then('the response should include legacy_date_created as "{expected_date}"') -def step_then_legacy_date_created(context: Context, expected_date: str): - """Assert legacy_date_created matches.""" - actual = context.location_response.get("legacy_date_created") +@then('the response should include nma_date_created as "{expected_date}"') +def step_then_nma_date_created(context: Context, expected_date: str): + """Assert nma_date_created matches.""" + actual = context.location_response.get("nma_date_created") assert actual == expected_date, f"Expected {expected_date}, got {actual}" -@then('the response should include legacy_site_date as "{expected_date}"') -def step_then_legacy_site_date(context: Context, expected_date: str): - """Assert legacy_site_date matches.""" - actual = context.location_response.get("legacy_site_date") +@then('the response should include nma_site_date as "{expected_date}"') +def step_then_nma_site_date(context: Context, expected_date: str): + """Assert nma_site_date matches.""" + actual = context.location_response.get("nma_site_date") assert actual == expected_date, f"Expected {expected_date}, got {actual}" @then("the time gap should be approximately {years} years") def step_then_time_gap_years(context: Context, years: str): """Assert approximate year gap.""" - legacy_str = context.location_response.get("legacy_date_created") - site_date_str = context.location_response.get("legacy_site_date") + legacy_str = context.location_response.get("nma_date_created") + site_date_str = context.location_response.get("nma_site_date") if not legacy_str or not site_date_str: raise AssertionError("Missing date fields for gap 
calculation") @@ -272,28 +272,28 @@ def step_then_time_gap_years(context: Context, years: str): ), f"Expected ~{expected_years} year gap, got {gap_years:.1f} years" -@then("each location should have a legacy_date_created field") +@then("each location should have a nma_date_created field") def step_then_all_have_legacy_field(context: Context): """Assert all locations have the field.""" items = context.locations_response.get("items", []) for item in items: - assert "legacy_date_created" in item, f"Location missing legacy_date_created" + assert "nma_date_created" in item, f"Location missing nma_date_created" -@then("each location should have a legacy_site_date field") +@then("each location should have a nma_site_date field") def step_then_all_have_site_date_field(context: Context): """Assert all locations have the field.""" items = context.locations_response.get("items", []) for item in items: - assert "legacy_site_date" in item, f"Location missing legacy_site_date" + assert "nma_site_date" in item, f"Location missing nma_site_date" -@then("some locations should have null legacy_site_date") +@then("some locations should have null nma_site_date") def step_then_some_null_site_date(context: Context): """Assert some locations have null.""" items = context.locations_response.get("items", []) - null_count = sum(1 for item in items if item.get("legacy_site_date") is None) - assert null_count > 0, "Expected at least one location with null legacy_site_date" + null_count = sum(1 for item in items if item.get("nma_site_date") is None) + assert null_count > 0, "Expected at least one location with null nma_site_date" @then("the response should only include locations with site date in that decade") @@ -301,8 +301,8 @@ def step_then_locations_in_decade(context: Context): """Assert filtered locations are in range.""" for loc in context.filtered_locations: assert ( - 2000 <= loc.legacy_site_date.year <= 2010 - ), f"Location not in 2000-2010: {loc.legacy_site_date}" + 2000 <= 
loc.nma_site_date.year <= 2010 + ), f"Location not in 2000-2010: {loc.nma_site_date}" @then("locations with site date before {year:Number} should not be included") @@ -310,8 +310,8 @@ def step_then_locations_before_excluded(context: Context, year: int): """Assert no locations before year.""" for loc in context.filtered_locations: assert ( - loc.legacy_site_date.year >= year - ), f"Location from {loc.legacy_site_date.year} should not be included" + loc.nma_site_date.year >= year + ), f"Location from {loc.nma_site_date.year} should not be included" @then("locations with site date after {year:Number} should not be included") @@ -319,8 +319,8 @@ def step_then_locations_after_excluded(context: Context, year: int): """Assert no locations after year.""" for loc in context.filtered_locations: assert ( - loc.legacy_site_date.year <= year - ), f"Location from {loc.legacy_site_date.year} should not be included" + loc.nma_site_date.year <= year + ), f"Location from {loc.nma_site_date.year} should not be included" @then("the response should include exactly {count:Number} locations") @@ -330,21 +330,21 @@ def step_then_exact_count_locations(context: Context, count: int): assert actual == count, f"Expected {count} locations, got {actual}" -@then('all should have legacy_date_created "{expected_date}"') +@then('all should have nma_date_created "{expected_date}"') def step_then_all_have_date(context: Context, expected_date: str): """Assert all have same date.""" expected = date.fromisoformat(expected_date) for loc in context.queried_locations: assert ( - loc.legacy_date_created == expected - ), f"Location has {loc.legacy_date_created}, expected {expected}" + loc.nma_date_created == expected + ), f"Location has {loc.nma_date_created}, expected {expected}" -@then("{percentage:Number}% should have non-null legacy_site_date") +@then("{percentage:Number}% should have non-null nma_site_date") def step_then_percentage_site_date(context: Context, percentage: int): - """Assert percentage 
with legacy_site_date.""" + """Assert percentage with nma_site_date.""" total = len(context.queried_locations) - populated = sum(1 for loc in context.queried_locations if loc.legacy_site_date) + populated = sum(1 for loc in context.queried_locations if loc.nma_site_date) actual_pct = (populated / total) * 100 tolerance = 2 @@ -353,11 +353,11 @@ def step_then_percentage_site_date(context: Context, percentage: int): ), f"Expected ~{percentage}%, got {actual_pct:.1f}%" -@then("{percentage:Number}% should have non-null legacy_date_created") +@then("{percentage:Number}% should have non-null nma_date_created") def step_then_percentage_legacy(context: Context, percentage: int): - """Assert percentage with legacy_date_created.""" + """Assert percentage with nma_date_created.""" total = len(context.queried_locations) - populated = sum(1 for loc in context.queried_locations if loc.legacy_date_created) + populated = sum(1 for loc in context.queried_locations if loc.nma_date_created) actual_pct = (populated / total) * 100 tolerance = 2 @@ -372,24 +372,24 @@ def step_then_has_created_at(context: Context): assert context.retrieved_location.created_at is not None -@then("it should have legacy_date_created (original AMPAPI DateCreated)") +@then("it should have nma_date_created (original AMPAPI DateCreated)") def step_then_has_legacy_date(context: Context): - """Assert legacy_date_created exists.""" - assert context.retrieved_location.legacy_date_created is not None + """Assert nma_date_created exists.""" + assert context.retrieved_location.nma_date_created is not None -@then("it should have legacy_site_date (original AMPAPI SiteDate)") +@then("it should have nma_site_date (original AMPAPI SiteDate)") def step_then_has_site_date(context: Context): - """Assert legacy_site_date exists.""" - assert context.retrieved_location.legacy_site_date is not None + """Assert nma_site_date exists.""" + assert context.retrieved_location.nma_site_date is not None @then("all three timestamps should 
be independently queryable") def step_then_all_queryable(context: Context): """Assert all fields are queryable.""" assert hasattr(context.retrieved_location, "created_at") - assert hasattr(context.retrieved_location, "legacy_date_created") - assert hasattr(context.retrieved_location, "legacy_site_date") + assert hasattr(context.retrieved_location, "nma_date_created") + assert hasattr(context.retrieved_location, "nma_site_date") @then("created_at should be a recent timestamp") @@ -401,25 +401,25 @@ def step_then_created_at_recent(context: Context): assert diff_seconds < 3600, "created_at should be within last hour" -@then("legacy_date_created should be an older date") +@then("nma_date_created should be an older date") def step_then_legacy_date_older(context: Context): - """Assert legacy_date_created is old.""" - legacy_date = context.retrieved_location.legacy_date_created - assert legacy_date.year < 2024, "legacy_date_created should be from the past" + """Assert nma_date_created is old.""" + legacy_date = context.retrieved_location.nma_date_created + assert legacy_date.year < 2024, "nma_date_created should be from the past" -@then('legacy_date_created should be "{expected_date}"') +@then('nma_date_created should be "{expected_date}"') def step_then_legacy_is(context: Context, expected_date: str): - """Assert legacy_date_created value.""" - actual = context.retrieved_location.legacy_date_created + """Assert nma_date_created value.""" + actual = context.retrieved_location.nma_date_created expected = date.fromisoformat(expected_date) assert actual == expected, f"Expected {expected}, got {actual}" -@then('legacy_site_date should be "{expected_date}"') +@then('nma_site_date should be "{expected_date}"') def step_then_site_date_is(context: Context, expected_date: str): - """Assert legacy_site_date value.""" - actual = context.retrieved_location.legacy_site_date + """Assert nma_site_date value.""" + actual = context.retrieved_location.nma_site_date expected = 
date.fromisoformat(expected_date) assert actual == expected, f"Expected {expected}, got {actual}" @@ -431,10 +431,10 @@ def step_then_no_error(context: Context): pass -@then("legacy_site_date should be null") +@then("nma_site_date should be null") def step_then_site_date_null(context: Context): - """Assert legacy_site_date is null.""" - assert context.retrieved_location.legacy_site_date is None + """Assert nma_site_date is null.""" + assert context.retrieved_location.nma_site_date is None @then("the well should still be valid") diff --git a/tests/test_location.py b/tests/test_location.py index b86211a58..67a4615c8 100644 --- a/tests/test_location.py +++ b/tests/test_location.py @@ -251,10 +251,10 @@ def test_new_location_has_null_legacy_fields(): data = response.json() assert "id" in data # Legacy fields should be present in response but null (not set during creation) - assert "legacy_date_created" in data - assert "legacy_site_date" in data - assert data["legacy_date_created"] is None - assert data["legacy_site_date"] is None + assert "nma_date_created" in data + assert "nma_site_date" in data + assert data["nma_date_created"] is None + assert data["nma_site_date"] is None # cleanup after test cleanup_post_test(Location, data["id"]) @@ -278,10 +278,10 @@ def test_legacy_fields_present_in_location_response(): data = get_response.json() # Verify fields exist in response (even if null) - assert "legacy_date_created" in data - assert "legacy_site_date" in data - assert data["legacy_date_created"] is None - assert data["legacy_site_date"] is None + assert "nma_date_created" in data + assert "nma_site_date" in data + assert data["nma_date_created"] is None + assert data["nma_site_date"] is None # cleanup after test cleanup_post_test(Location, location_id) @@ -303,12 +303,12 @@ def test_legacy_fields_independent_of_created_at(): assert "created_at" in data assert data["created_at"] is not None - # legacy_date_created is separate and null for new records - assert 
"legacy_date_created" in data - assert data["legacy_date_created"] is None + # nma_date_created is separate and null for new records + assert "nma_date_created" in data + assert data["nma_date_created"] is None # These are independent fields with different purposes - assert "created_at" != "legacy_date_created" + assert "created_at" != "nma_date_created" # cleanup after test cleanup_post_test(Location, data["id"]) diff --git a/tests/test_transfer_legacy_dates.py b/tests/test_transfer_legacy_dates.py index 795820ec8..5068d8882 100644 --- a/tests/test_transfer_legacy_dates.py +++ b/tests/test_transfer_legacy_dates.py @@ -17,8 +17,8 @@ Unit tests for legacy date field population during AMPAPI → NMSampleLocations migration. These tests verify that: -1. Location.legacy_date_created is populated from CSV DateCreated -2. Location.legacy_site_date is populated from CSV SiteDate (if not null) +1. Location.nma_date_created is populated from CSV DateCreated +2. Location.nma_site_date is populated from CSV SiteDate (if not null) """ import datetime from unittest.mock import Mock, patch, MagicMock @@ -35,7 +35,7 @@ @patch("transfers.util.lexicon_mapper") def test_make_location_with_both_legacy_dates(mock_lexicon_mapper): - """Test that make_location populates both legacy_date_created and legacy_site_date""" + """Test that make_location populates both nma_date_created and nma_site_date""" # Mock lexicon mapper to avoid GCS calls mock_lexicon_mapper.map_value.return_value = "GPS" @@ -63,13 +63,13 @@ def test_make_location_with_both_legacy_dates(mock_lexicon_mapper): # Call make_location location, elevation_method = make_location(row, elevations) - # Verify legacy_date_created is set from DateCreated - assert location.legacy_date_created is not None - assert location.legacy_date_created == datetime.date(2014, 4, 3) + # Verify nma_date_created is set from DateCreated + assert location.nma_date_created is not None + assert location.nma_date_created == datetime.date(2014, 4, 3) - # 
Verify legacy_site_date is set from SiteDate - assert location.legacy_site_date is not None - assert location.legacy_site_date == datetime.date(2002, 12, 10) + # Verify nma_site_date is set from SiteDate + assert location.nma_site_date is not None + assert location.nma_site_date == datetime.date(2002, 12, 10) # Verify created_at is NOT set during migration (it's auto-set by AutoBaseMixin on save) assert location.created_at is None @@ -102,11 +102,11 @@ def test_make_location_with_only_date_created(mock_lexicon_mapper): elevations = {} location, elevation_method = make_location(row, elevations) - # Verify legacy_date_created is set - assert location.legacy_date_created == datetime.date(2014, 4, 3) + # Verify nma_date_created is set + assert location.nma_date_created == datetime.date(2014, 4, 3) - # Verify legacy_site_date is null (91% of locations don't have SiteDate) - assert location.legacy_site_date is None + # Verify nma_site_date is null (91% of locations don't have SiteDate) + assert location.nma_site_date is None @patch("transfers.util.lexicon_mapper") @@ -137,8 +137,8 @@ def test_make_location_with_site_date_later_than_date_created(mock_lexicon_mappe location, elevation_method = make_location(row, elevations) # Both dates should be preserved as-is, regardless of order - assert location.legacy_date_created == datetime.date(2010, 1, 15) - assert location.legacy_site_date == datetime.date(2015, 6, 20) + assert location.nma_date_created == datetime.date(2010, 1, 15) + assert location.nma_site_date == datetime.date(2015, 6, 20) @patch("transfers.util.lexicon_mapper") @@ -169,11 +169,11 @@ def test_make_location_with_very_old_site_date(mock_lexicon_mapper): location, elevation_method = make_location(row, elevations) # Verify very old date is preserved - assert location.legacy_site_date == datetime.date(1954, 5, 1) - assert location.legacy_date_created == datetime.date(2008, 5, 28) + assert location.nma_site_date == datetime.date(1954, 5, 1) + assert 
location.nma_date_created == datetime.date(2008, 5, 28) # Verify 54-year time gap - time_gap = (location.legacy_date_created - location.legacy_site_date).days + time_gap = (location.nma_date_created - location.nma_site_date).days assert time_gap == 19751 # Approximately 54 years @@ -205,15 +205,15 @@ def test_make_location_legacy_dates_are_date_not_datetime(mock_lexicon_mapper): location, elevation_method = make_location(row, elevations) # Verify they are date objects (not datetime) - assert isinstance(location.legacy_date_created, datetime.date) - assert not isinstance(location.legacy_date_created, datetime.datetime) + assert isinstance(location.nma_date_created, datetime.date) + assert not isinstance(location.nma_date_created, datetime.datetime) - assert isinstance(location.legacy_site_date, datetime.date) - assert not isinstance(location.legacy_site_date, datetime.datetime) + assert isinstance(location.nma_site_date, datetime.date) + assert not isinstance(location.nma_site_date, datetime.datetime) # Verify time component is stripped - assert location.legacy_date_created == datetime.date(2014, 4, 3) - assert location.legacy_site_date == datetime.date(2002, 12, 10) + assert location.nma_date_created == datetime.date(2014, 4, 3) + assert location.nma_site_date == datetime.date(2002, 12, 10) @patch("transfers.util.lexicon_mapper") @@ -247,12 +247,12 @@ def test_make_location_legacy_dates_independent_of_created_at(mock_lexicon_mappe assert location.created_at is None # legacy fields should be Date (no timezone) - assert isinstance(location.legacy_date_created, datetime.date) - assert isinstance(location.legacy_site_date, datetime.date) + assert isinstance(location.nma_date_created, datetime.date) + assert isinstance(location.nma_site_date, datetime.date) # Legacy fields should be populated - assert location.legacy_date_created is not None - assert location.legacy_site_date is not None + assert location.nma_date_created is not None + assert location.nma_site_date is 
not None # ============================================================================ @@ -312,15 +312,15 @@ def test_location_legacy_date_coverage_statistics(mock_lexicon_mapper): location, _ = make_location(row, elevations) # Count coverage - if location.legacy_date_created is not None: + if location.nma_date_created is not None: locations_created += 1 - if location.legacy_site_date is not None: + if location.nma_site_date is not None: locations_with_site_date += 1 # Verify expected coverage - assert locations_created == 100 # 100% should have legacy_date_created - assert locations_with_site_date == 9 # 9% should have legacy_site_date + assert locations_created == 100 # 100% should have nma_date_created + assert locations_with_site_date == 9 # 9% should have nma_site_date # ============================================================================ diff --git a/transfers/util.py b/transfers/util.py index d39845f44..c8d054a0a 100644 --- a/transfers/util.py +++ b/transfers/util.py @@ -244,16 +244,16 @@ def make_location(row: pd.Series, elevations: dict) -> tuple: f"LU_AltitudeMethod:{row.AltitudeMethod.strip()}" ) - # Extract legacy date fields (Date type, not DateTime) - legacy_date_created = None + # Extract AMPAPI date fields (Date type, not DateTime) + nma_date_created = None if row.DateCreated: - legacy_date_created = datetime.strptime( + nma_date_created = datetime.strptime( row.DateCreated, "%Y-%m-%d %H:%M:%S.%f" ).date() - legacy_site_date = None + nma_site_date = None if row.SiteDate: - legacy_site_date = datetime.strptime( + nma_site_date = datetime.strptime( row.SiteDate, "%Y-%m-%d %H:%M:%S.%f" ).date() @@ -264,8 +264,8 @@ def make_location(row: pd.Series, elevations: dict) -> tuple: release_status="public" if row.PublicRelease else "private", nma_coordinate_notes=row.CoordinateNotes, nma_notes_location=row.LocationNotes, - legacy_date_created=legacy_date_created, - legacy_site_date=legacy_site_date, + nma_date_created=nma_date_created, + 
nma_site_date=nma_site_date, ) return location, elevation_method From 2e33f83842886b903f4a9c6481f656e9b5424af5 Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 3 Dec 2025 01:02:08 -0800 Subject: [PATCH 42/66] Remove legacy fields from `UpdateLocation` schema --- db/location.py | 4 ++-- schemas/location.py | 8 ++------ ...st-migration-legacy-data-retrieval.feature | 20 +++++++++---------- .../steps/post_migration_legacy_data.py | 2 +- tests/test_location.py | 14 ++++++------- tests/test_transfer_legacy_dates.py | 8 ++++---- 6 files changed, 26 insertions(+), 30 deletions(-) diff --git a/db/location.py b/db/location.py index c00c11a70..cef3d0857 100644 --- a/db/location.py +++ b/db/location.py @@ -66,10 +66,10 @@ class Location(Base, AutoBaseMixin, ReleaseMixin, NotesMixin, DataProvenanceMixi nma_date_created: Mapped[datetime.date] = mapped_column( Date, nullable=True, - comment="Original AMPAPI DateCreated (migration-only field)", + comment="Original AMPAPI DateCreated (read-only, populated only during migration)", ) nma_site_date: Mapped[datetime.date] = mapped_column( - Date, nullable=True, comment="Original AMPAPI SiteDate (migration-only field)" + Date, nullable=True, comment="Original AMPAPI SiteDate (read-only, populated only during migration)" ) # --- Relationship Definitions --- diff --git a/schemas/location.py b/schemas/location.py index f34c54115..fce13ef99 100644 --- a/schemas/location.py +++ b/schemas/location.py @@ -107,7 +107,7 @@ class GeoJSONProperties(BaseModel): default_factory=GeoJSONUTMCoordinates ) notes: list[NoteResponse] = [] - # AMPAPI date fields (migration-only, read-only) + # AMPAPI date fields (read-only, populated only during migration) nma_date_created: date | None = None nma_site_date: date | None = None @@ -190,7 +190,7 @@ class LocationResponse(BaseResponseModel): county: str | None quad_name: str | None - # AMPAPI date fields (migration-only, read-only post-migration) + # AMPAPI date fields (read-only, populated only 
during migration, not in Create/Update schemas) nma_date_created: date | None = None nma_site_date: date | None = None @@ -232,9 +232,5 @@ class UpdateLocation(BaseUpdateModel, ValidateLocation): coordinate_accuracy: float | None = None coordinate_method: CoordinateMethod | None = None - # AMPAPI date fields (migration-only, can be updated but not created) - nma_date_created: date | None = None - nma_site_date: date | None = None - # ============= EOF ============================================= diff --git a/tests/features/post-migration-legacy-data-retrieval.feature b/tests/features/post-migration-legacy-data-retrieval.feature index 61f3e8c40..99fd08190 100644 --- a/tests/features/post-migration-legacy-data-retrieval.feature +++ b/tests/features/post-migration-legacy-data-retrieval.feature @@ -1,16 +1,16 @@ -Feature: Post-Migration Legacy Data Retrieval +Feature: Post-Migration AMPAPI Date Field Retrieval As a data manager After migrating data from AMPAPI to NMSampleLocations - I want to verify that all legacy temporal information is preserved and queryable + I want to verify that all AMPAPI temporal information is preserved and queryable So that no historical context is lost Background: Given a functioning api And the AMPAPI data has been migrated to the database - # Location Legacy Date Lookups + # Location AMPAPI Date Lookups (Read-Only Fields) - Scenario: Retrieve location with both legacy dates via API + Scenario: Retrieve location with both AMPAPI date fields via API Given a location exists with: | field | value | | nma_date_created | 2014-04-03 | @@ -30,14 +30,14 @@ Feature: Post-Migration Legacy Data Retrieval And the response should include nma_site_date as "1954-05-01" And the time gap should be approximately 54 years - Scenario: List all locations includes legacy date fields - Given 5 locations exist with various legacy dates + Scenario: List all locations includes AMPAPI date fields + Given 5 locations exist with various AMPAPI dates When I GET 
/location to list all locations Then each location should have a nma_date_created field And each location should have a nma_site_date field And some locations should have null nma_site_date - Scenario: Filter locations by legacy site date range + Scenario: Filter locations by AMPAPI site date range Given locations exist with nma_site_date ranging from 1950 to 2024 When I filter locations where nma_site_date is between "2000-01-01" and "2010-12-31" Then the response should only include locations with nma_site_date in that decade @@ -53,7 +53,7 @@ Feature: Post-Migration Legacy Data Retrieval # Data Quality Validation - Scenario: Verify migration preserved expected percentage of legacy dates + Scenario: Verify migration preserved expected percentage of AMPAPI dates Given 100 locations were migrated And 9 of them had non-null SiteDate in AMPAPI When I query the migrated locations @@ -62,8 +62,8 @@ Feature: Post-Migration Legacy Data Retrieval # Audit Trail Verification - Scenario: Legacy dates preserved alongside audit timestamps - Given a location was migrated with legacy dates + Scenario: AMPAPI dates preserved alongside audit timestamps + Given a location was migrated with AMPAPI dates When I retrieve that location Then it should have created_at (new system timestamp from migration) And it should have nma_date_created (original AMPAPI DateCreated) diff --git a/tests/features/steps/post_migration_legacy_data.py b/tests/features/steps/post_migration_legacy_data.py index 99ddd028e..d568c0296 100644 --- a/tests/features/steps/post_migration_legacy_data.py +++ b/tests/features/steps/post_migration_legacy_data.py @@ -32,7 +32,7 @@ def parse_number(text): def create_test_location(nma_date_created=None, nma_site_date=None): - """Helper to create a test location with legacy dates.""" + """Helper to create a test location with AMPAPI date fields (read-only post-migration).""" with session_ctx() as session: location = Location( point="POINT(-106.607784 35.118924)", diff --git 
a/tests/test_location.py b/tests/test_location.py index 67a4615c8..6e143f1eb 100644 --- a/tests/test_location.py +++ b/tests/test_location.py @@ -250,7 +250,7 @@ def test_new_location_has_null_legacy_fields(): assert response.status_code == 201 data = response.json() assert "id" in data - # Legacy fields should be present in response but null (not set during creation) + # AMPAPI date fields should be present in response but null (not set during creation, read-only) assert "nma_date_created" in data assert "nma_site_date" in data assert data["nma_date_created"] is None @@ -261,8 +261,8 @@ def test_new_location_has_null_legacy_fields(): def test_legacy_fields_present_in_location_response(): - """Test that legacy fields are included in location GET response""" - # Create a new location (without legacy fields) + """Test that AMPAPI date fields (read-only) are included in location GET response""" + # Create a new location (without AMPAPI date fields set - they're read-only) payload = { "point": "POINT (-106.607784 35.118924)", "elevation": 1558.8, @@ -272,12 +272,12 @@ def test_legacy_fields_present_in_location_response(): assert create_response.status_code == 201 location_id = create_response.json()["id"] - # Retrieve the location and verify legacy fields are in the schema + # Retrieve the location and verify AMPAPI date fields are in the schema get_response = client.get(f"/location/{location_id}") assert get_response.status_code == 200 data = get_response.json() - # Verify fields exist in response (even if null) + # Verify read-only fields exist in response (even if null) assert "nma_date_created" in data assert "nma_site_date" in data assert data["nma_date_created"] is None @@ -288,7 +288,7 @@ def test_legacy_fields_present_in_location_response(): def test_legacy_fields_independent_of_created_at(): - """Test that created_at (system timestamp) is separate from legacy fields""" + """Test that created_at (system timestamp) is separate from AMPAPI date fields 
(read-only)""" payload = { "point": "POINT (-106.607784 35.118924)", "elevation": 1558.8, @@ -303,7 +303,7 @@ def test_legacy_fields_independent_of_created_at(): assert "created_at" in data assert data["created_at"] is not None - # nma_date_created is separate and null for new records + # nma_date_created is separate and null for new records (read-only, populated only during migration) assert "nma_date_created" in data assert data["nma_date_created"] is None diff --git a/tests/test_transfer_legacy_dates.py b/tests/test_transfer_legacy_dates.py index 5068d8882..c4e06755f 100644 --- a/tests/test_transfer_legacy_dates.py +++ b/tests/test_transfer_legacy_dates.py @@ -14,11 +14,11 @@ # limitations under the License. # =============================================================================== """ -Unit tests for legacy date field population during AMPAPI → NMSampleLocations migration. +Unit tests for AMPAPI date field population during AMPAPI → NMSampleLocations migration. These tests verify that: -1. Location.nma_date_created is populated from CSV DateCreated -2. Location.nma_site_date is populated from CSV SiteDate (if not null) +1. Location.nma_date_created is populated from CSV DateCreated (read-only post-migration) +2. 
Location.nma_site_date is populated from CSV SiteDate if not null (read-only post-migration) """ import datetime from unittest.mock import Mock, patch, MagicMock @@ -29,7 +29,7 @@ # ============================================================================ -# LOCATION LEGACY DATE TESTS +# LOCATION AMPAPI DATE TESTS (Read-Only Post-Migration) # ============================================================================ From aef077b0f8e45ccbdcb8e7247128c053f329ff8f Mon Sep 17 00:00:00 2001 From: kbighorse Date: Wed, 3 Dec 2025 09:02:10 +0000 Subject: [PATCH 43/66] Formatting changes --- db/location.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/db/location.py b/db/location.py index cef3d0857..fda4611f9 100644 --- a/db/location.py +++ b/db/location.py @@ -69,7 +69,9 @@ class Location(Base, AutoBaseMixin, ReleaseMixin, NotesMixin, DataProvenanceMixi comment="Original AMPAPI DateCreated (read-only, populated only during migration)", ) nma_site_date: Mapped[datetime.date] = mapped_column( - Date, nullable=True, comment="Original AMPAPI SiteDate (read-only, populated only during migration)" + Date, + nullable=True, + comment="Original AMPAPI SiteDate (read-only, populated only during migration)", ) # --- Relationship Definitions --- From 6258e7de97d367e4b1c2814113457f902219b85a Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 3 Dec 2025 01:06:08 -0800 Subject: [PATCH 44/66] DRY up the mock lexicon mapper into a fixture --- tests/test_transfer_legacy_dates.py | 46 ++++++++++------------------- 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/tests/test_transfer_legacy_dates.py b/tests/test_transfer_legacy_dates.py index c4e06755f..badaec8b2 100644 --- a/tests/test_transfer_legacy_dates.py +++ b/tests/test_transfer_legacy_dates.py @@ -28,16 +28,26 @@ from transfers.util import make_location +# ============================================================================ +# FIXTURES +# 
============================================================================ + + +@pytest.fixture +def mock_lexicon_mapper(): + """Fixture to mock lexicon_mapper for all transfer tests""" + with patch("transfers.util.lexicon_mapper") as mock: + mock.map_value.return_value = "GPS" + yield mock + + # ============================================================================ # LOCATION AMPAPI DATE TESTS (Read-Only Post-Migration) # ============================================================================ -@patch("transfers.util.lexicon_mapper") def test_make_location_with_both_legacy_dates(mock_lexicon_mapper): """Test that make_location populates both nma_date_created and nma_site_date""" - # Mock lexicon mapper to avoid GCS calls - mock_lexicon_mapper.map_value.return_value = "GPS" # Create a mock CSV row with both DateCreated and SiteDate row = pd.Series( @@ -75,12 +85,8 @@ def test_make_location_with_both_legacy_dates(mock_lexicon_mapper): assert location.created_at is None -@patch("transfers.util.lexicon_mapper") def test_make_location_with_only_date_created(mock_lexicon_mapper): """Test that make_location handles locations with only DateCreated (no SiteDate)""" - # Mock lexicon mapper to avoid GCS calls - mock_lexicon_mapper.map_value.return_value = "GPS" - row = pd.Series( { "PointID": "TEST-002", @@ -109,12 +115,8 @@ def test_make_location_with_only_date_created(mock_lexicon_mapper): assert location.nma_site_date is None -@patch("transfers.util.lexicon_mapper") def test_make_location_with_site_date_later_than_date_created(mock_lexicon_mapper): """Test data anomaly: SiteDate is later than DateCreated (should still be accepted)""" - # Mock lexicon mapper to avoid GCS calls - mock_lexicon_mapper.map_value.return_value = "GPS" - row = pd.Series( { "PointID": "TEST-003", @@ -141,12 +143,8 @@ def test_make_location_with_site_date_later_than_date_created(mock_lexicon_mappe assert location.nma_site_date == datetime.date(2015, 6, 20) 
-@patch("transfers.util.lexicon_mapper") def test_make_location_with_very_old_site_date(mock_lexicon_mapper): """Test that very old SiteDates (1950s) are preserved correctly""" - # Mock lexicon mapper to avoid GCS calls - mock_lexicon_mapper.map_value.return_value = "GPS" - row = pd.Series( { "PointID": "SM-0227", # Real example from dataset @@ -177,12 +175,8 @@ def test_make_location_with_very_old_site_date(mock_lexicon_mapper): assert time_gap == 19751 # Approximately 54 years -@patch("transfers.util.lexicon_mapper") def test_make_location_legacy_dates_are_date_not_datetime(mock_lexicon_mapper): - """Test that legacy date fields are Date type (not DateTime)""" - # Mock lexicon mapper to avoid GCS calls - mock_lexicon_mapper.map_value.return_value = "GPS" - + """Test that AMPAPI date fields are Date type (not DateTime)""" row = pd.Series( { "PointID": "TEST-004", @@ -216,12 +210,8 @@ def test_make_location_legacy_dates_are_date_not_datetime(mock_lexicon_mapper): assert location.nma_site_date == datetime.date(2002, 12, 10) -@patch("transfers.util.lexicon_mapper") def test_make_location_legacy_dates_independent_of_created_at(mock_lexicon_mapper): - """Test that legacy dates don't affect created_at timestamp""" - # Mock lexicon mapper to avoid GCS calls - mock_lexicon_mapper.map_value.return_value = "GPS" - + """Test that AMPAPI dates don't affect created_at timestamp""" row = pd.Series( { "PointID": "TEST-005", @@ -260,12 +250,8 @@ def test_make_location_legacy_dates_independent_of_created_at(mock_lexicon_mappe # ============================================================================ -@patch("transfers.util.lexicon_mapper") def test_location_legacy_date_coverage_statistics(mock_lexicon_mapper): - """Test that migration preserves expected percentages of legacy dates""" - # Mock lexicon mapper to avoid GCS calls - mock_lexicon_mapper.map_value.return_value = "GPS" - + """Test that migration preserves expected percentages of AMPAPI dates""" # Simulate 100 location 
records from CSV locations_created = 0 locations_with_site_date = 0 From fd4562a785a2f5bfaba7ddf535bc7d53acce3161 Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 3 Dec 2025 01:12:35 -0800 Subject: [PATCH 45/66] Replace legacy python timestamp call with current implementation --- tests/features/steps/post_migration_legacy_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/features/steps/post_migration_legacy_data.py b/tests/features/steps/post_migration_legacy_data.py index bd7425269..0327d6f0f 100644 --- a/tests/features/steps/post_migration_legacy_data.py +++ b/tests/features/steps/post_migration_legacy_data.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # =============================================================================== -from datetime import date, datetime +from datetime import date, datetime, timezone from behave import given, when, then, register_type from behave.runner import Context import parse @@ -394,7 +394,7 @@ def step_then_all_queryable(context: Context): def step_then_created_at_recent(context: Context): """Assert created_at is recent.""" created_at = context.retrieved_location.created_at.replace(tzinfo=None) - now = datetime.utcnow() + now = datetime.now(timezone.utc).replace(tzinfo=None) diff_seconds = abs((now - created_at).total_seconds()) assert diff_seconds < 3600, "created_at should be within last hour" From 5b1a07dd10eb672441a166e7241af82ea29b77d6 Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 3 Dec 2025 01:20:16 -0800 Subject: [PATCH 46/66] Preserve timezone in comparison --- tests/features/steps/post_migration_legacy_data.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/features/steps/post_migration_legacy_data.py b/tests/features/steps/post_migration_legacy_data.py index 0327d6f0f..6e504734e 100644 --- a/tests/features/steps/post_migration_legacy_data.py +++ 
b/tests/features/steps/post_migration_legacy_data.py @@ -393,8 +393,13 @@ def step_then_all_queryable(context: Context): @then("created_at should be a recent timestamp") def step_then_created_at_recent(context: Context): """Assert created_at is recent.""" - created_at = context.retrieved_location.created_at.replace(tzinfo=None) - now = datetime.now(timezone.utc).replace(tzinfo=None) + created_at = context.retrieved_location.created_at + now = datetime.now(timezone.utc) + + # Ensure both datetimes are timezone-aware for accurate comparison + if created_at.tzinfo is None: + created_at = created_at.replace(tzinfo=timezone.utc) + diff_seconds = abs((now - created_at).total_seconds()) assert diff_seconds < 3600, "created_at should be within last hour" From b92a9864a38bff51a03a4d0c8500ef81b9161f2e Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 3 Dec 2025 01:23:16 -0800 Subject: [PATCH 47/66] Make features more human-readable --- .../post-migration-legacy-data-retrieval.feature | 12 ++++++------ tests/features/steps/post_migration_legacy_data.py | 14 +++++++------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/features/post-migration-legacy-data-retrieval.feature b/tests/features/post-migration-legacy-data-retrieval.feature index 99fd08190..13b2b347d 100644 --- a/tests/features/post-migration-legacy-data-retrieval.feature +++ b/tests/features/post-migration-legacy-data-retrieval.feature @@ -33,16 +33,16 @@ Feature: Post-Migration AMPAPI Date Field Retrieval Scenario: List all locations includes AMPAPI date fields Given 5 locations exist with various AMPAPI dates When I GET /location to list all locations - Then each location should have a nma_date_created field - And each location should have a nma_site_date field - And some locations should have null nma_site_date + Then each location should have a date created field + And each location should have a site date field + And some locations should have null site date Scenario: Filter 
locations by AMPAPI site date range Given locations exist with nma_site_date ranging from 1950 to 2024 When I filter locations where nma_site_date is between "2000-01-01" and "2010-12-31" - Then the response should only include locations with nma_site_date in that decade - And locations with nma_site_date before 2000 should not be included - And locations with nma_site_date after 2010 should not be included + Then the response should only include locations with site date in that decade + And locations with site date before 2000 should not be included + And locations with site date after 2010 should not be included Scenario: Query location by nma_date_created Given 3 locations exist with nma_date_created "2014-04-03" diff --git a/tests/features/steps/post_migration_legacy_data.py b/tests/features/steps/post_migration_legacy_data.py index 6e504734e..bf6e8b443 100644 --- a/tests/features/steps/post_migration_legacy_data.py +++ b/tests/features/steps/post_migration_legacy_data.py @@ -270,28 +270,28 @@ def step_then_time_gap_years(context: Context, years: str): ), f"Expected ~{expected_years} year gap, got {gap_years:.1f} years" -@then("each location should have a nma_date_created field") +@then("each location should have a date created field") def step_then_all_have_legacy_field(context: Context): - """Assert all locations have the field.""" + """Assert all locations have the date created field.""" items = context.locations_response.get("items", []) for item in items: assert "nma_date_created" in item, f"Location missing nma_date_created" -@then("each location should have a nma_site_date field") +@then("each location should have a site date field") def step_then_all_have_site_date_field(context: Context): - """Assert all locations have the field.""" + """Assert all locations have the site date field.""" items = context.locations_response.get("items", []) for item in items: assert "nma_site_date" in item, f"Location missing nma_site_date" -@then("some locations should 
have null nma_site_date") +@then("some locations should have null site date") def step_then_some_null_site_date(context: Context): - """Assert some locations have null.""" + """Assert some locations have null site date.""" items = context.locations_response.get("items", []) null_count = sum(1 for item in items if item.get("nma_site_date") is None) - assert null_count > 0, "Expected at least one location with null nma_site_date" + assert null_count > 0, "Expected at least one location with null site date" @then("the response should only include locations with site date in that decade") From 94addc7315ee69b35703a1068c8d37f96d328121 Mon Sep 17 00:00:00 2001 From: kbighorse Date: Wed, 3 Dec 2025 09:26:00 +0000 Subject: [PATCH 48/66] Formatting changes --- tests/test_transfer_legacy_dates.py | 32 +++++++++++++++-------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/tests/test_transfer_legacy_dates.py b/tests/test_transfer_legacy_dates.py index c298b129e..e2b4ca0f2 100644 --- a/tests/test_transfer_legacy_dates.py +++ b/tests/test_transfer_legacy_dates.py @@ -255,21 +255,23 @@ def test_location_legacy_date_coverage_statistics(mock_lexicon_mapper): def create_test_row(i, has_site_date): """Helper to create test row with common fields""" - return pd.Series({ - "PointID": f"TEST-{i:03d}", - "Easting": 350000 + i, - "Northing": 3880000 + i, - "DateCreated": "2014-04-03 00:00:00.000", - "SiteDate": "2002-12-10 00:00:00.000" if has_site_date else None, - "Altitude": 1558.8, - "AltDatum": "NAVD88", - "AltitudeMethod": "GPS", - "LocationId": i, - "PublicRelease": True, - "CoordinateNotes": None, - "LocationNotes": None, - "AltitudeAccuracy": None, - }) + return pd.Series( + { + "PointID": f"TEST-{i:03d}", + "Easting": 350000 + i, + "Northing": 3880000 + i, + "DateCreated": "2014-04-03 00:00:00.000", + "SiteDate": "2002-12-10 00:00:00.000" if has_site_date else None, + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": i, 
+ "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) # Simulate 100 location records from CSV (9% with SiteDate, 91% without) locations_created = 0 From 0b4d77d181052170de06ea251dc6881f2a797f0f Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 3 Dec 2025 01:26:07 -0800 Subject: [PATCH 49/66] Simulate CSV rows more effiiently --- tests/test_transfer_legacy_dates.py | 62 ++++++++++------------------- 1 file changed, 22 insertions(+), 40 deletions(-) diff --git a/tests/test_transfer_legacy_dates.py b/tests/test_transfer_legacy_dates.py index badaec8b2..c298b129e 100644 --- a/tests/test_transfer_legacy_dates.py +++ b/tests/test_transfer_legacy_dates.py @@ -252,55 +252,37 @@ def test_make_location_legacy_dates_independent_of_created_at(mock_lexicon_mappe def test_location_legacy_date_coverage_statistics(mock_lexicon_mapper): """Test that migration preserves expected percentages of AMPAPI dates""" - # Simulate 100 location records from CSV + + def create_test_row(i, has_site_date): + """Helper to create test row with common fields""" + return pd.Series({ + "PointID": f"TEST-{i:03d}", + "Easting": 350000 + i, + "Northing": 3880000 + i, + "DateCreated": "2014-04-03 00:00:00.000", + "SiteDate": "2002-12-10 00:00:00.000" if has_site_date else None, + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": i, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + }) + + # Simulate 100 location records from CSV (9% with SiteDate, 91% without) locations_created = 0 locations_with_site_date = 0 + elevations = {} for i in range(100): - if i < 9: # 9% have SiteDate - row = pd.Series( - { - "PointID": f"TEST-{i:03d}", - "Easting": 350000 + i, - "Northing": 3880000 + i, - "DateCreated": "2014-04-03 00:00:00.000", - "SiteDate": "2002-12-10 00:00:00.000", - "Altitude": 1558.8, - "AltDatum": "NAVD88", - "AltitudeMethod": "GPS", - 
"LocationId": i, - "PublicRelease": True, - "CoordinateNotes": None, - "LocationNotes": None, - "AltitudeAccuracy": None, - } - ) - else: # 91% don't have SiteDate - row = pd.Series( - { - "PointID": f"TEST-{i:03d}", - "Easting": 350000 + i, - "Northing": 3880000 + i, - "DateCreated": "2014-04-03 00:00:00.000", - "SiteDate": None, - "Altitude": 1558.8, - "AltDatum": "NAVD88", - "AltitudeMethod": "GPS", - "LocationId": i, - "PublicRelease": True, - "CoordinateNotes": None, - "LocationNotes": None, - "AltitudeAccuracy": None, - } - ) - - elevations = {} + row = create_test_row(i, has_site_date=(i < 9)) location, _ = make_location(row, elevations) # Count coverage if location.nma_date_created is not None: locations_created += 1 - if location.nma_site_date is not None: locations_with_site_date += 1 From 2d12844f305091758277b45a49976170397c06e3 Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 3 Dec 2025 01:39:05 -0800 Subject: [PATCH 50/66] Replace `legacy_` in method names --- .../steps/post_migration_legacy_data.py | 26 +++++++++---------- tests/test_location.py | 10 +++---- tests/test_transfer_legacy_dates.py | 8 +++--- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/tests/features/steps/post_migration_legacy_data.py b/tests/features/steps/post_migration_legacy_data.py index bf6e8b443..5850bf04e 100644 --- a/tests/features/steps/post_migration_legacy_data.py +++ b/tests/features/steps/post_migration_legacy_data.py @@ -91,9 +91,9 @@ def step_given_multiple_locations(context: Context, count: int): ] for i in range(min(count, len(test_data))): - legacy_date, site_date = test_data[i] + created_date, site_date = test_data[i] location = create_test_location( - nma_date_created=date.fromisoformat(legacy_date), + nma_date_created=date.fromisoformat(created_date), nma_site_date=(date.fromisoformat(site_date) if site_date else None), ) context.test_locations.append(location) @@ -203,7 +203,7 @@ def step_when_filter_locations(context: Context, 
start_date: str, end_date: str) @when('I query for locations with nma_date_created "{target_date}"') -def step_when_query_by_legacy_date(context: Context, target_date: str): +def step_when_query_by_ampapi_date(context: Context, target_date: str): """Query locations by nma_date_created.""" with session_ctx() as session: target = date.fromisoformat(target_date) @@ -251,16 +251,16 @@ def step_then_nma_site_date(context: Context, expected_date: str): @then("the time gap should be approximately {years} years") def step_then_time_gap_years(context: Context, years: str): """Assert approximate year gap.""" - legacy_str = context.location_response.get("nma_date_created") + created_str = context.location_response.get("nma_date_created") site_date_str = context.location_response.get("nma_site_date") - if not legacy_str or not site_date_str: + if not created_str or not site_date_str: raise AssertionError("Missing date fields for gap calculation") - legacy_date = date.fromisoformat(legacy_str) + created_date = date.fromisoformat(created_str) site_date = date.fromisoformat(site_date_str) - gap_days = (legacy_date - site_date).days + gap_days = (created_date - site_date).days gap_years = gap_days / 365.25 expected_years = float(years) @@ -271,7 +271,7 @@ def step_then_time_gap_years(context: Context, years: str): @then("each location should have a date created field") -def step_then_all_have_legacy_field(context: Context): +def step_then_all_have_date_created_field(context: Context): """Assert all locations have the date created field.""" items = context.locations_response.get("items", []) for item in items: @@ -371,7 +371,7 @@ def step_then_has_created_at(context: Context): @then("it should have nma_date_created (original AMPAPI DateCreated)") -def step_then_has_legacy_date(context: Context): +def step_then_has_ampapi_date_created(context: Context): """Assert nma_date_created exists.""" assert context.retrieved_location.nma_date_created is not None @@ -405,14 +405,14 @@ def 
step_then_created_at_recent(context: Context): @then("nma_date_created should be an older date") -def step_then_legacy_date_older(context: Context): +def step_then_ampapi_date_older(context: Context): """Assert nma_date_created is old.""" - legacy_date = context.retrieved_location.nma_date_created - assert legacy_date.year < 2024, "nma_date_created should be from the past" + ampapi_created_date = context.retrieved_location.nma_date_created + assert ampapi_created_date.year < 2024, "nma_date_created should be from the past" @then('nma_date_created should be "{expected_date}"') -def step_then_legacy_is(context: Context, expected_date: str): +def step_then_ampapi_created_is(context: Context, expected_date: str): """Assert nma_date_created value.""" actual = context.retrieved_location.nma_date_created expected = date.fromisoformat(expected_date) diff --git a/tests/test_location.py b/tests/test_location.py index 6e143f1eb..9dcb3d098 100644 --- a/tests/test_location.py +++ b/tests/test_location.py @@ -235,11 +235,11 @@ def test_delete_location_404_not_found(second_location): assert data["detail"] == f"Location with ID {bad_location_id} not found." 
-# ============= Legacy date field tests ======================================= +# ============= AMPAPI date field tests ======================================= -def test_new_location_has_null_legacy_fields(): - """Test that newly created locations have null legacy date fields (legacy fields are migration-only)""" +def test_new_location_has_null_ampapi_fields(): + """Test that newly created locations have null AMPAPI date fields (AMPAPI fields are migration-only)""" payload = { "point": "POINT (-106.607784 35.118924)", "elevation": 1558.8, @@ -260,7 +260,7 @@ def test_new_location_has_null_legacy_fields(): cleanup_post_test(Location, data["id"]) -def test_legacy_fields_present_in_location_response(): +def test_ampapi_fields_present_in_location_response(): """Test that AMPAPI date fields (read-only) are included in location GET response""" # Create a new location (without AMPAPI date fields set - they're read-only) payload = { @@ -287,7 +287,7 @@ def test_legacy_fields_present_in_location_response(): cleanup_post_test(Location, location_id) -def test_legacy_fields_independent_of_created_at(): +def test_ampapi_fields_independent_of_created_at(): """Test that created_at (system timestamp) is separate from AMPAPI date fields (read-only)""" payload = { "point": "POINT (-106.607784 35.118924)", diff --git a/tests/test_transfer_legacy_dates.py b/tests/test_transfer_legacy_dates.py index c298b129e..d700ab470 100644 --- a/tests/test_transfer_legacy_dates.py +++ b/tests/test_transfer_legacy_dates.py @@ -46,7 +46,7 @@ def mock_lexicon_mapper(): # ============================================================================ -def test_make_location_with_both_legacy_dates(mock_lexicon_mapper): +def test_make_location_with_both_ampapi_dates(mock_lexicon_mapper): """Test that make_location populates both nma_date_created and nma_site_date""" # Create a mock CSV row with both DateCreated and SiteDate @@ -175,7 +175,7 @@ def 
test_make_location_with_very_old_site_date(mock_lexicon_mapper): assert time_gap == 19751 # Approximately 54 years -def test_make_location_legacy_dates_are_date_not_datetime(mock_lexicon_mapper): +def test_make_location_ampapi_dates_are_date_not_datetime(mock_lexicon_mapper): """Test that AMPAPI date fields are Date type (not DateTime)""" row = pd.Series( { @@ -210,7 +210,7 @@ def test_make_location_legacy_dates_are_date_not_datetime(mock_lexicon_mapper): assert location.nma_site_date == datetime.date(2002, 12, 10) -def test_make_location_legacy_dates_independent_of_created_at(mock_lexicon_mapper): +def test_make_location_ampapi_dates_independent_of_created_at(mock_lexicon_mapper): """Test that AMPAPI dates don't affect created_at timestamp""" row = pd.Series( { @@ -250,7 +250,7 @@ def test_make_location_legacy_dates_independent_of_created_at(mock_lexicon_mappe # ============================================================================ -def test_location_legacy_date_coverage_statistics(mock_lexicon_mapper): +def test_location_ampapi_date_coverage_statistics(mock_lexicon_mapper): """Test that migration preserves expected percentages of AMPAPI dates""" def create_test_row(i, has_site_date): From 8c96e72d21fedcc7b939eb63b1e18effb1ac7eda Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 3 Dec 2025 01:46:39 -0800 Subject: [PATCH 51/66] Increase code test coverage --- .coverage | Bin 0 -> 53248 bytes tests/test_transfer_legacy_dates.py | 56 ++++ transfers/util.py,cover | 461 ++++++++++++++++++++++++++++ 3 files changed, 517 insertions(+) create mode 100644 .coverage create mode 100644 transfers/util.py,cover diff --git a/.coverage b/.coverage new file mode 100644 index 0000000000000000000000000000000000000000..5417251745baf9f63193ac63e03d4d4b0edab5c4 GIT binary patch literal 53248 zcmeI)&u$Y(90%}S+t^vh)&|kYiXxQhftrXU#_geMdw@`)3Y8WpX)l#9V|$!z@$Q=4 zH6(|sHB>@E>Kjyj1o{X(L!W>iIrY{{p;CXpon8M4HsVkeNWUxF**`P0GoSfq$JSiG 
zcG-!P==gyvBk_T;V3?+HUI@c5EP9sdnQjHz$)yMM*BslQwQCt058o-<-x#I*&xU=Y ze8pZb{Zf8!{`Zn=J)HlwAgvZ%zy<*bKmY>&Z-K$JQlYZ6WZwHelFhD)0@+f5el36c z)z;?rme}6BaCuAUbK-PS(6+uVHbvmy6+IP*j?-1b@!C#HMvk{5qFt4S?uV+)6CItS zsg46~#PNBr;j}4Mq;@EYUf{Sg*cZ3d{#qO$xm%?6B7K4qQO=IXA;jq<`l<+2M+M4j zsZi(Qw9{TKuHOHlSg4#hVLniCnPkwS$NWTT=)|ZfmpYVC%MaR|`(_}$)~*WIgdCQz zuE4uQ8ob z^XT_WqH}aFhzH;46smoD$ zC#o|g=Ni?q`kr&UuZESsCPp#W;$nFIn&3iE3ph2OuY7dEG^kaoz3_II-fuNzKl0<_ z233BePA{dOj?Wb;D=X%M+i^o-o<`G&!r_iJ(P)OJXEYtYKD*s;FPUs4+>5$h;dkOL z!?A^~9BR$ueWsRT$bdVV@RNX)(?e}-oLS0)Xm-4=Aca@OK z(DLu7K<=oGy)2Une%GS!PrZa2;jrwr6I9SRTD$C2}Ju7ZTNo^&63KLBz zt;s3FXnM(2GVBNGK=QIdGL11BCDT5VT%qEu7xII7 zgFZjA6Itdnqbz@}_Sf&oE{!HV=>*heX$;mgfZ%{_Nr7e4??-X_<-@pn^4v4ZuTEfe zoZ3^U)i^kH@2h?oIy9~Fv=x4wo)y(y8S*F;Rz=hIyGnX`ZlD;KLb|6DiZ~B1&PwM- zX@vWn#?GB;oTqeh;bDH8I$heoXvcM>*G}duo`!X*H1>1nDZ5=-kfp6^w0?oS#0wY&wC<7beHQrrUhT^-u1mMqRhNBU4T%;zsnJqKUP#Ln zg|&X3QT4FV8=00bZa0SG_<0uX=z1Rwx`H$&iRZg9%_FF*fd{l7P( zVv!37KmY;|fB*y_009U<00Izzz-to7npvZ)zyE)3*w0_nU63pUAOHafKmY;|fB*y_ z009U<00M_9kj<7Y{r&%A!+w0IS;5UA009U<00Izz00bZa0SG_<0uY!jaLzntSp&NM z_=)xR%Dc3)aMAeDSjZX1oI!^d+Bb~zOHcl(7RJ*MqIX;$=k)jg&kXz7cmV7}00Izz z00bZa0SG_<0uX=z1R!uk1adjciogFi7LG`XPz(q_00Izz00bZa0SG_<0uX=z1f~k` z_y73(Ka~b2ApijgKmY;|fB*y_009U<00Kuw0PFuBnW{uVAOHafKmY;|fB*y_009U< M00Pql;`RUk1r+UXZU6uP literal 0 HcmV?d00001 diff --git a/tests/test_transfer_legacy_dates.py b/tests/test_transfer_legacy_dates.py index 5129b7d9c..985214fbb 100644 --- a/tests/test_transfer_legacy_dates.py +++ b/tests/test_transfer_legacy_dates.py @@ -250,6 +250,62 @@ def test_make_location_ampapi_dates_independent_of_created_at(mock_lexicon_mappe # ============================================================================ +def test_make_location_with_no_ampapi_dates(mock_lexicon_mapper): + """Test that make_location handles locations with no AMPAPI dates (both null)""" + row = pd.Series( + { + "PointID": "TEST-NODATES", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": None, # No DateCreated + 
"SiteDate": None, # No SiteDate + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 999, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # Both AMPAPI date fields should be null + assert location.nma_date_created is None + assert location.nma_site_date is None + + +def test_make_location_with_empty_string_dates(mock_lexicon_mapper): + """Test that make_location handles empty string dates (CSV edge case)""" + row = pd.Series( + { + "PointID": "TEST-EMPTY", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": "", # Empty string + "SiteDate": "", # Empty string + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 998, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # Both AMPAPI date fields should be null (empty strings are falsy) + assert location.nma_date_created is None + assert location.nma_site_date is None + + def test_location_ampapi_date_coverage_statistics(mock_lexicon_mapper): """Test that migration preserves expected percentages of AMPAPI dates""" diff --git a/transfers/util.py,cover b/transfers/util.py,cover new file mode 100644 index 000000000..5c2803392 --- /dev/null +++ b/transfers/util.py,cover @@ -0,0 +1,461 @@ + # =============================================================================== + # Copyright 2025 ross + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. 
+ # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + # =============================================================================== +> import csv +> import io +> import os +> import re +> from datetime import datetime, timezone, timedelta +> from pathlib import Path + +> import numpy as np +> import pandas as pd +> import pytz +> from shapely import Point +> from sqlalchemy import select +> from sqlalchemy.orm import Session + +> from constants import SRID_WGS84, SRID_UTM_ZONE_13N +> from db import Thing, Location, DataProvenance +> from services.gcs_helper import get_storage_bucket + + # from services.lexicon_mapper import lexicon_mapper +> from services.util import ( +> transform_srid, +> get_epqs_elevation_from_point, +> convert_ft_to_m, +> convert_ngvd29_to_navd88, +> ) +> from transfers.logger import logger + +> NMA_COORDINATE_ACCURACY = { +> "5m": (5, "m"), +> "1": (0.1, "second"), +> "5": (0.5, "second"), +> "F": (5, "second"), +> "H": (0.01, "second"), +> "M": (1, "minute"), +> "R": (3, "second"), +> "S": (1, "second"), +> "T": (10, "second"), +> } + + +> def replace_nans(df: pd.DataFrame, default=None) -> pd.DataFrame: +! df = df.replace(pd.NA, default) +! return df.replace({np.nan: default}) + + +> def read_csv(name: str, dtype: dict | None = None) -> pd.DataFrame: + # Try to read from local data directory first +! local_file = Path(__file__).parent / "data" / f"{name}.csv" + +! if local_file.exists(): +! logger.info(f"Reading {name} from local file: {local_file}") +! if dtype: +! return pd.read_csv(local_file, dtype=dtype) +! else: +! 
return pd.read_csv(local_file) + + # Check cache directory +! p = get_transfers_data_path(Path("nma_csv_cache") / f"{name}.csv") +! if os.path.exists(p): +! logger.info(f"Reading {name} from cache: {p}") +! return pd.read_csv(p, dtype=dtype) + + # Fall back to GCS if local file doesn't exist +! logger.info(f"Local file and cache not found, reading {name} from GCS") +! bucket = get_storage_bucket() +! blob = bucket.blob(f"nma_csv/{name}.csv") +! data = blob.download_as_bytes() +! with open(p, "wb") as f: +! f.write(data) + +! if dtype: +! return pd.read_csv(io.BytesIO(data), dtype=dtype) +! else: +! return pd.read_csv(io.BytesIO(data)) + + +> def get_valid_point_ids(session, thing_type="water well"): +! things = get_valid_things(session, thing_type) +! valid_pointids = [thing.name for thing in things] +! return valid_pointids + + +> def get_valid_things(session, thing_type="water well"): +! return session.query(Thing).where(Thing.thing_type == thing_type).all() + + +> def extract_organization(alternate_id: str) -> str: +! if alternate_id.startswith("TWDB"): +! return "TWDB" +! elif alternate_id.startswith("NMED"): +! return "NMED" + + # TODO: There are a bunch of other formats used for AlternateSiteID. + # we should try to handle as many as possible but its not the end of the world + # if we have to update the organization for a particular alternate id at a later time +! for regex, org in ((r"^A-Z{1,2}-\d{5,6}$", "NMOSE"), (r"\d+(\.\d+){3,}", "PLSS")): + +! if re.match(regex, alternate_id): +! return org + +! return "Unknown" + + +> def get_transfers_data_path(name): +! def data_path(r): +! return Path(r) / "transfers" / "data" + +! root = data_path("/workspace") +! if not os.path.exists(root): +! root = data_path("..") +! if not os.path.exists(root): +! root = data_path(".") + +! return root / name + + +> def filter_non_transferred_wells(sess: Session, df: pd.DataFrame) -> pd.DataFrame: +! sql = select(Thing.name).where(Thing.thing_type == "water well") +! 
existing_ids = sess.execute(sql).scalars().all() +! return df[~(df["PointID"].isin(existing_ids))] + + +> def filter_by_welldata_datasource_and_project(df: pd.DataFrame) -> pd.DataFrame: +! path = get_transfers_data_path("valid_welldata_datasources.csv") +! with open(path, "r") as f: +! reader = csv.reader(f) +! _ = next(reader) +! valid_datasources = [row[0] for row in reader if row[1] == "Yes"] +! f.seek(0) +! invalid_datasources = [row[0] for row in reader if row[1] == "NO"] +! logger.info("Invalid WellData Datasources:") +! for vd in invalid_datasources: +! logger.info(f" {vd}") + +! counts = df.groupby("DataSource").size().reset_index(name="WellCount") +! counts = counts.sort_values("WellCount", ascending=False) +! for count in counts.itertuples(): +! logger.info(f"{count.DataSource}: {count.WellCount}") + +! pldf = read_csv("ProjectLocations") +! collabnet = pldf[pldf["ProjectName"] == "Water Level Network"] +! return df[ +! df["DataSource"].isin(valid_datasources) +! | df["PointID"].isin(collabnet["PointID"]) +! ] + + +> def filter_by_valid_measuring_agency(df: pd.DataFrame) -> pd.DataFrame: +! path = get_transfers_data_path("valid_measuring_agency.csv") + +! with open(path, "r") as f: +! reader = csv.reader(f) +! _ = next(reader) +! valid_measuring_agencies = [row[0] for row in reader if row[1] == "Yes"] +! logger.info("Valid Measuring Agencies:") +! for vma in valid_measuring_agencies: +! logger.info(f" {vma}") +! return df[df["MeasuringAgency"].isin(valid_measuring_agencies)] + + +> def filter_to_valid_point_ids(session: Session, df: pd.DataFrame) -> pd.DataFrame: +! valid_point_ids = get_valid_point_ids(session) +! return df[df["PointID"].isin(valid_point_ids)] + + +> def convert_mt_to_utc(dt_record: datetime): +! t = dt_record.time() +! if t.hour == 0 and t.minute == 0: + # no time was measured, so just set the timezone to UTC and keep + # time at 00:00 +! dt_record = dt_record.replace(tzinfo=timezone.utc) +! else: +! 
tz = pytz.timezone("America/Denver") +! dt_record = tz.localize(dt_record) +! if dt_record.dst() == timedelta(0): + # MST +! utc_offset = 7 +! else: + # MDT +! utc_offset = 6 +! dt_record = dt_record - timedelta(hours=utc_offset) +! dt_record = dt_record.replace(tzinfo=timezone.utc) +! return dt_record + + +> def chunk_by_size(df, chunk_size): +! for i in range(0, len(df), chunk_size): +! yield df.iloc[i : i + chunk_size] + + +> def make_location(row: pd.Series, elevations: dict) -> tuple: +> """ +> Returns a tuple of location data and the elevation method +> """ +> point = Point(row.Easting, row.Northing) + + # Convert the point to a WGS84 coordinate system +> transformed_point = transform_srid( +> point, source_srid=SRID_UTM_ZONE_13N, target_srid=SRID_WGS84 +> ) + +> z = row.Altitude +> if z: +> elevation_from_epqs = False +> z = convert_ft_to_m(z) + +> if row.AltDatum == "NGVD29": +! key = f"{row.PointID}, {transformed_point.x, transformed_point.y}" +! if key in elevations: +! z = elevations[key] +! else: +! z = convert_ngvd29_to_navd88( +! z, transformed_point.x, transformed_point.y +! ) +! elevations[key] = z +! else: +! elevation_from_epqs = True +! logger.info( +! f"Location {row.PointID} has no Altitude. Setting from National Map EPQS for " +! ) +! z = get_epqs_elevation_from_point(transformed_point.x, transformed_point.y) + +> if elevation_from_epqs: +! elevation_method = "USGS National Elevation Dataset (NED)" +> elif pd.isna(row.AltitudeMethod): +! 
elevation_method = None +> else: +> elevation_method = lexicon_mapper.map_value( +> f"LU_AltitudeMethod:{row.AltitudeMethod.strip()}" +> ) + + # Extract AMPAPI date fields (Date type, not DateTime) +> nma_date_created = None +> if row.DateCreated: +> nma_date_created = datetime.strptime( +> row.DateCreated, "%Y-%m-%d %H:%M:%S.%f" +> ).date() + +> nma_site_date = None +> if row.SiteDate: +> nma_site_date = datetime.strptime(row.SiteDate, "%Y-%m-%d %H:%M:%S.%f").date() + +> location = Location( +> nma_pk_location=row.LocationId, +> point=transformed_point.wkt, +> elevation=z, +> release_status="public" if row.PublicRelease else "private", +> nma_coordinate_notes=row.CoordinateNotes, +> nma_notes_location=row.LocationNotes, +> nma_date_created=nma_date_created, +> nma_site_date=nma_site_date, +> ) + +> return location, elevation_method + + +> def make_location_data_provenance( +> row: pd.Series, location: Location, elevation_method: str | None +> ) -> list[DataProvenance]: +! provenance_records = [] + +! if row.AltitudeAccuracy or row.CoordinateAccuracy: +! provenance = DataProvenance( +! target_id=location.id, +! target_table="location", +! field_name="elevation", +! origin_source=None, +! collection_method=elevation_method, +! accuracy_value=( +! None +! if pd.isna(row.AltitudeAccuracy) +! else convert_ft_to_m(row.AltitudeAccuracy) +! ), +! accuracy_unit="m", +! ) +! 
provenance_records.append(provenance) + + # TODO: AMP feedback is required for transfering coordinate accuracy values + # from NM_Aquifer to Ocotillo + # if row.CoordinateAccuracy == "U" or pd.isna(row.CoordinateAccuracy): + # # map "Unknown" to None + # row.CoordinateAccuracy = None + # elif row.CoordinateAccuracy == "5m": + # row.CoordinateAccuracy = 5.0 + # else: + # seconds = 0 + # minutes = 0 + # if row.CoordinateAccuracy == "1": + # seconds = 0.1 + # elif row.CoordinateAccuracy == "5": + # seconds = 0.5 + # elif row.CoordinateAccuracy == "F": + # seconds = 5 + # elif row.CoordinateAccuracy == "H": + # seconds = 0.01 + # elif row.CoordinateAccuracy == "M": + # minutes = 1 + # elif row.CoordinateAccuracy == "R": + # seconds = 3 + # elif row.CoordinateAccuracy == "S": + # seconds = 1 + # else: + # seconds = 10 + # coordinate_accuracy_decimal_deg = minutes/60 + seconds / 3600 + + # """ + # Developer's notes + + # To convert accuracy from decimal degrees to meters we do the following: + + # 1. Add the coordinate accuracy to both the latitude and longitude to + # find the "+" distance from the location + # 2. Convert "+" accuracy coordinates from decimal degrees to UTM Zone 13 + # N + # 3. Find the distance in meters from the original Easting/Northing and + # define this as the "+" accuracy in meters + # 4. Subtract the coordinate accuracy to both the latitude and longitude + # to find the "-" distance from the location + # 5. Convert the "-" accuracy coordinates from decimal degrees to UTM Zone + # 13 N + # 6. Find the distance in meters from the original Easting/Northing and + # define this as the "-" accuracy in meters + # 7. 
Set the coordinate accuracy in meters as the mean of the "+" and "-" + # distances from the location + # """ + # original_longitude = transformed_point.x + # original_latitude = transformed_point.y + + # plus_longitude = original_longitude + coordinate_accuracy_decimal_deg + # plus_latitude = original_latitude + coordinate_accuracy_decimal_deg + # plus_point_decimal_deg = Point(plus_longitude, plus_latitude) + # plus_point_utm_zone_13_n = transform_srid( + # plus_point_decimal_deg, + # SRID_WGS84, + # SRID_UTM_ZONE_13N) + + # minus_longitude = original_longitude - coordinate_accuracy_decimal_deg + # minus_latitude = original_latitude - coordinate_accuracy_decimal_deg + # minus_point_decimal_deg = Point(minus_longitude, minus_latitude) + +! if row.CoordinateMethod or row.CoordinateAccuracy: +! coordinate_method = ( +! lexicon_mapper.map_value(f"LU_CoordinateMethod:{row.CoordinateMethod}") +! if not pd.isna(row.CoordinateMethod) +! else None +! ) + +! accuracy_value, accuracy_unit = NMA_COORDINATE_ACCURACY.get( +! row.CoordinateAccuracy, (None, None) +! ) + +! provenance = DataProvenance( +! target_id=location.id, +! target_table="location", +! field_name="point", +! origin_source=None, +! collection_method=coordinate_method, +! accuracy_value=accuracy_value, +! accuracy_unit=accuracy_unit, +! ) +! provenance_records.append(provenance) + +! return provenance_records + + +> def timeit_direct(func, *args, **kwargs): +! start = datetime.now() +! result = func(*args, **kwargs) +! end = datetime.now() +! logger.info(f"TIMING: {func.__name__} took {(end - start).total_seconds()} seconds") +! return result + + +> def timeit(func): +! def wrapper(*args, **kwargs): +! return timeit_direct(func, *args, **kwargs) + +! return wrapper + + +> class LexiconMapper: +> def __init__(self): +> self._mappers = None + +> def map_value(self, value): +! value = value.strip() +! return self._make_lu_to_lexicon_mapper().get(value, value) + +> def _make_lu_to_lexicon_mapper(self): +! 
if self._mappers: +! return self._mappers + + # Lookup tables where CODE maps to MEANING +! lu_tables = [ +! "LU_AltitudeMethod", +! "LU_CollectionMethod", +! "LU_ConstructionMethod", +! "LU_CoordinateAccuracy", +! "LU_CoordinateMethod", +! "LU_CurrentUse", +! "LU_DataQuality", +! "LU_DataSource", +! "LU_Depth_CompletionSource", +! "LU_Discharge_ChemistrySource", +! "LU_LevelStatus", +! "LU_MajorAnalyte", +! "LU_MeasurementMethod", +! "LU_MinorTraceAnalyte", +! "LU_MonitoringStatus", +! "LU_SampleType", +! "LU_SiteType", +! "LU_Status", +! ] + + # Lookup tables intentionally skipped (kept for documentation only) + # Each entry explains why the table is excluded +! _lu_tables_skipped = { +! "LU_AltitudeDatum": "code is the value, so no need for mapping", +! "LU_CoordinateDatum": "code is the value, so no need for mapping", +! "LU_FieldNoteTypes": "not being used in the transfers since there are no records", +! "LU_Formations": "needs to be cleaned before it can be used", +! "LU_Lithology": "needs to be cleaned before it can be used", +! "LU_MeasuringAgency": "the abbreviation is what is used in the new schema", +! } +! mappers = {} + +! for lu_table in lu_tables: +! table = read_csv(lu_table) + +! for i, row in table.iterrows(): +! if lu_table == "LU_Formations": +! code = row.Code +! meaning = row.Meaning +! else: +! code = row.CODE +! meaning = row.MEANING + +! mappers.update({f"{lu_table}:{code}": meaning}) +! self._mappers = mappers +! 
return mappers + + +> lexicon_mapper = LexiconMapper() + + + # ============= EOF ============================================= From 48f503d1afbab2579e447522001e66ab9fcc5543 Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 3 Dec 2025 01:49:34 -0800 Subject: [PATCH 52/66] Enforce timezone info on `created_at` --- tests/features/steps/post_migration_legacy_data.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/features/steps/post_migration_legacy_data.py b/tests/features/steps/post_migration_legacy_data.py index 5850bf04e..3baa7f5f3 100644 --- a/tests/features/steps/post_migration_legacy_data.py +++ b/tests/features/steps/post_migration_legacy_data.py @@ -396,9 +396,14 @@ def step_then_created_at_recent(context: Context): created_at = context.retrieved_location.created_at now = datetime.now(timezone.utc) - # Ensure both datetimes are timezone-aware for accurate comparison + # created_at should always be timezone-aware (configured in AutoBaseMixin with DateTime(timezone=True)) + # If it's naive, this indicates a database/ORM configuration issue if created_at.tzinfo is None: - created_at = created_at.replace(tzinfo=timezone.utc) + raise AssertionError( + "created_at is a naive datetime (no timezone info). " + "Ensure the database and ORM are configured to return timezone-aware datetimes in UTC. 
" + "AutoBaseMixin.created_at uses DateTime(timezone=True) with server_default=func.timezone('UTC', func.now())" + ) diff_seconds = abs((now - created_at).total_seconds()) assert diff_seconds < 3600, "created_at should be within last hour" From 43a8c5f5649ccd7b42ba826c4da8e150ffbd51c2 Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 3 Dec 2025 01:51:21 -0800 Subject: [PATCH 53/66] Ignore test coverage artifacts --- .gitignore | 7 + transfers/util.py,cover | 461 ---------------------------------------- 2 files changed, 7 insertions(+), 461 deletions(-) delete mode 100644 transfers/util.py,cover diff --git a/.gitignore b/.gitignore index 44b28e13c..4bf6245e0 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,13 @@ dist/ wheels/ *.egg-info +# Test coverage reports +*.cover +.coverage +.coverage.* +htmlcov/ +coverage.xml + # Virtual environments .venv requirements.txt diff --git a/transfers/util.py,cover b/transfers/util.py,cover deleted file mode 100644 index 5c2803392..000000000 --- a/transfers/util.py,cover +++ /dev/null @@ -1,461 +0,0 @@ - # =============================================================================== - # Copyright 2025 ross - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. 
- # =============================================================================== -> import csv -> import io -> import os -> import re -> from datetime import datetime, timezone, timedelta -> from pathlib import Path - -> import numpy as np -> import pandas as pd -> import pytz -> from shapely import Point -> from sqlalchemy import select -> from sqlalchemy.orm import Session - -> from constants import SRID_WGS84, SRID_UTM_ZONE_13N -> from db import Thing, Location, DataProvenance -> from services.gcs_helper import get_storage_bucket - - # from services.lexicon_mapper import lexicon_mapper -> from services.util import ( -> transform_srid, -> get_epqs_elevation_from_point, -> convert_ft_to_m, -> convert_ngvd29_to_navd88, -> ) -> from transfers.logger import logger - -> NMA_COORDINATE_ACCURACY = { -> "5m": (5, "m"), -> "1": (0.1, "second"), -> "5": (0.5, "second"), -> "F": (5, "second"), -> "H": (0.01, "second"), -> "M": (1, "minute"), -> "R": (3, "second"), -> "S": (1, "second"), -> "T": (10, "second"), -> } - - -> def replace_nans(df: pd.DataFrame, default=None) -> pd.DataFrame: -! df = df.replace(pd.NA, default) -! return df.replace({np.nan: default}) - - -> def read_csv(name: str, dtype: dict | None = None) -> pd.DataFrame: - # Try to read from local data directory first -! local_file = Path(__file__).parent / "data" / f"{name}.csv" - -! if local_file.exists(): -! logger.info(f"Reading {name} from local file: {local_file}") -! if dtype: -! return pd.read_csv(local_file, dtype=dtype) -! else: -! return pd.read_csv(local_file) - - # Check cache directory -! p = get_transfers_data_path(Path("nma_csv_cache") / f"{name}.csv") -! if os.path.exists(p): -! logger.info(f"Reading {name} from cache: {p}") -! return pd.read_csv(p, dtype=dtype) - - # Fall back to GCS if local file doesn't exist -! logger.info(f"Local file and cache not found, reading {name} from GCS") -! bucket = get_storage_bucket() -! blob = bucket.blob(f"nma_csv/{name}.csv") -! 
data = blob.download_as_bytes() -! with open(p, "wb") as f: -! f.write(data) - -! if dtype: -! return pd.read_csv(io.BytesIO(data), dtype=dtype) -! else: -! return pd.read_csv(io.BytesIO(data)) - - -> def get_valid_point_ids(session, thing_type="water well"): -! things = get_valid_things(session, thing_type) -! valid_pointids = [thing.name for thing in things] -! return valid_pointids - - -> def get_valid_things(session, thing_type="water well"): -! return session.query(Thing).where(Thing.thing_type == thing_type).all() - - -> def extract_organization(alternate_id: str) -> str: -! if alternate_id.startswith("TWDB"): -! return "TWDB" -! elif alternate_id.startswith("NMED"): -! return "NMED" - - # TODO: There are a bunch of other formats used for AlternateSiteID. - # we should try to handle as many as possible but its not the end of the world - # if we have to update the organization for a particular alternate id at a later time -! for regex, org in ((r"^A-Z{1,2}-\d{5,6}$", "NMOSE"), (r"\d+(\.\d+){3,}", "PLSS")): - -! if re.match(regex, alternate_id): -! return org - -! return "Unknown" - - -> def get_transfers_data_path(name): -! def data_path(r): -! return Path(r) / "transfers" / "data" - -! root = data_path("/workspace") -! if not os.path.exists(root): -! root = data_path("..") -! if not os.path.exists(root): -! root = data_path(".") - -! return root / name - - -> def filter_non_transferred_wells(sess: Session, df: pd.DataFrame) -> pd.DataFrame: -! sql = select(Thing.name).where(Thing.thing_type == "water well") -! existing_ids = sess.execute(sql).scalars().all() -! return df[~(df["PointID"].isin(existing_ids))] - - -> def filter_by_welldata_datasource_and_project(df: pd.DataFrame) -> pd.DataFrame: -! path = get_transfers_data_path("valid_welldata_datasources.csv") -! with open(path, "r") as f: -! reader = csv.reader(f) -! _ = next(reader) -! valid_datasources = [row[0] for row in reader if row[1] == "Yes"] -! f.seek(0) -! 
invalid_datasources = [row[0] for row in reader if row[1] == "NO"] -! logger.info("Invalid WellData Datasources:") -! for vd in invalid_datasources: -! logger.info(f" {vd}") - -! counts = df.groupby("DataSource").size().reset_index(name="WellCount") -! counts = counts.sort_values("WellCount", ascending=False) -! for count in counts.itertuples(): -! logger.info(f"{count.DataSource}: {count.WellCount}") - -! pldf = read_csv("ProjectLocations") -! collabnet = pldf[pldf["ProjectName"] == "Water Level Network"] -! return df[ -! df["DataSource"].isin(valid_datasources) -! | df["PointID"].isin(collabnet["PointID"]) -! ] - - -> def filter_by_valid_measuring_agency(df: pd.DataFrame) -> pd.DataFrame: -! path = get_transfers_data_path("valid_measuring_agency.csv") - -! with open(path, "r") as f: -! reader = csv.reader(f) -! _ = next(reader) -! valid_measuring_agencies = [row[0] for row in reader if row[1] == "Yes"] -! logger.info("Valid Measuring Agencies:") -! for vma in valid_measuring_agencies: -! logger.info(f" {vma}") -! return df[df["MeasuringAgency"].isin(valid_measuring_agencies)] - - -> def filter_to_valid_point_ids(session: Session, df: pd.DataFrame) -> pd.DataFrame: -! valid_point_ids = get_valid_point_ids(session) -! return df[df["PointID"].isin(valid_point_ids)] - - -> def convert_mt_to_utc(dt_record: datetime): -! t = dt_record.time() -! if t.hour == 0 and t.minute == 0: - # no time was measured, so just set the timezone to UTC and keep - # time at 00:00 -! dt_record = dt_record.replace(tzinfo=timezone.utc) -! else: -! tz = pytz.timezone("America/Denver") -! dt_record = tz.localize(dt_record) -! if dt_record.dst() == timedelta(0): - # MST -! utc_offset = 7 -! else: - # MDT -! utc_offset = 6 -! dt_record = dt_record - timedelta(hours=utc_offset) -! dt_record = dt_record.replace(tzinfo=timezone.utc) -! return dt_record - - -> def chunk_by_size(df, chunk_size): -! for i in range(0, len(df), chunk_size): -! 
yield df.iloc[i : i + chunk_size] - - -> def make_location(row: pd.Series, elevations: dict) -> tuple: -> """ -> Returns a tuple of location data and the elevation method -> """ -> point = Point(row.Easting, row.Northing) - - # Convert the point to a WGS84 coordinate system -> transformed_point = transform_srid( -> point, source_srid=SRID_UTM_ZONE_13N, target_srid=SRID_WGS84 -> ) - -> z = row.Altitude -> if z: -> elevation_from_epqs = False -> z = convert_ft_to_m(z) - -> if row.AltDatum == "NGVD29": -! key = f"{row.PointID}, {transformed_point.x, transformed_point.y}" -! if key in elevations: -! z = elevations[key] -! else: -! z = convert_ngvd29_to_navd88( -! z, transformed_point.x, transformed_point.y -! ) -! elevations[key] = z -! else: -! elevation_from_epqs = True -! logger.info( -! f"Location {row.PointID} has no Altitude. Setting from National Map EPQS for " -! ) -! z = get_epqs_elevation_from_point(transformed_point.x, transformed_point.y) - -> if elevation_from_epqs: -! elevation_method = "USGS National Elevation Dataset (NED)" -> elif pd.isna(row.AltitudeMethod): -! 
elevation_method = None -> else: -> elevation_method = lexicon_mapper.map_value( -> f"LU_AltitudeMethod:{row.AltitudeMethod.strip()}" -> ) - - # Extract AMPAPI date fields (Date type, not DateTime) -> nma_date_created = None -> if row.DateCreated: -> nma_date_created = datetime.strptime( -> row.DateCreated, "%Y-%m-%d %H:%M:%S.%f" -> ).date() - -> nma_site_date = None -> if row.SiteDate: -> nma_site_date = datetime.strptime(row.SiteDate, "%Y-%m-%d %H:%M:%S.%f").date() - -> location = Location( -> nma_pk_location=row.LocationId, -> point=transformed_point.wkt, -> elevation=z, -> release_status="public" if row.PublicRelease else "private", -> nma_coordinate_notes=row.CoordinateNotes, -> nma_notes_location=row.LocationNotes, -> nma_date_created=nma_date_created, -> nma_site_date=nma_site_date, -> ) - -> return location, elevation_method - - -> def make_location_data_provenance( -> row: pd.Series, location: Location, elevation_method: str | None -> ) -> list[DataProvenance]: -! provenance_records = [] - -! if row.AltitudeAccuracy or row.CoordinateAccuracy: -! provenance = DataProvenance( -! target_id=location.id, -! target_table="location", -! field_name="elevation", -! origin_source=None, -! collection_method=elevation_method, -! accuracy_value=( -! None -! if pd.isna(row.AltitudeAccuracy) -! else convert_ft_to_m(row.AltitudeAccuracy) -! ), -! accuracy_unit="m", -! ) -! 
provenance_records.append(provenance) - - # TODO: AMP feedback is required for transfering coordinate accuracy values - # from NM_Aquifer to Ocotillo - # if row.CoordinateAccuracy == "U" or pd.isna(row.CoordinateAccuracy): - # # map "Unknown" to None - # row.CoordinateAccuracy = None - # elif row.CoordinateAccuracy == "5m": - # row.CoordinateAccuracy = 5.0 - # else: - # seconds = 0 - # minutes = 0 - # if row.CoordinateAccuracy == "1": - # seconds = 0.1 - # elif row.CoordinateAccuracy == "5": - # seconds = 0.5 - # elif row.CoordinateAccuracy == "F": - # seconds = 5 - # elif row.CoordinateAccuracy == "H": - # seconds = 0.01 - # elif row.CoordinateAccuracy == "M": - # minutes = 1 - # elif row.CoordinateAccuracy == "R": - # seconds = 3 - # elif row.CoordinateAccuracy == "S": - # seconds = 1 - # else: - # seconds = 10 - # coordinate_accuracy_decimal_deg = minutes/60 + seconds / 3600 - - # """ - # Developer's notes - - # To convert accuracy from decimal degrees to meters we do the following: - - # 1. Add the coordinate accuracy to both the latitude and longitude to - # find the "+" distance from the location - # 2. Convert "+" accuracy coordinates from decimal degrees to UTM Zone 13 - # N - # 3. Find the distance in meters from the original Easting/Northing and - # define this as the "+" accuracy in meters - # 4. Subtract the coordinate accuracy to both the latitude and longitude - # to find the "-" distance from the location - # 5. Convert the "-" accuracy coordinates from decimal degrees to UTM Zone - # 13 N - # 6. Find the distance in meters from the original Easting/Northing and - # define this as the "-" accuracy in meters - # 7. 
Set the coordinate accuracy in meters as the mean of the "+" and "-" - # distances from the location - # """ - # original_longitude = transformed_point.x - # original_latitude = transformed_point.y - - # plus_longitude = original_longitude + coordinate_accuracy_decimal_deg - # plus_latitude = original_latitude + coordinate_accuracy_decimal_deg - # plus_point_decimal_deg = Point(plus_longitude, plus_latitude) - # plus_point_utm_zone_13_n = transform_srid( - # plus_point_decimal_deg, - # SRID_WGS84, - # SRID_UTM_ZONE_13N) - - # minus_longitude = original_longitude - coordinate_accuracy_decimal_deg - # minus_latitude = original_latitude - coordinate_accuracy_decimal_deg - # minus_point_decimal_deg = Point(minus_longitude, minus_latitude) - -! if row.CoordinateMethod or row.CoordinateAccuracy: -! coordinate_method = ( -! lexicon_mapper.map_value(f"LU_CoordinateMethod:{row.CoordinateMethod}") -! if not pd.isna(row.CoordinateMethod) -! else None -! ) - -! accuracy_value, accuracy_unit = NMA_COORDINATE_ACCURACY.get( -! row.CoordinateAccuracy, (None, None) -! ) - -! provenance = DataProvenance( -! target_id=location.id, -! target_table="location", -! field_name="point", -! origin_source=None, -! collection_method=coordinate_method, -! accuracy_value=accuracy_value, -! accuracy_unit=accuracy_unit, -! ) -! provenance_records.append(provenance) - -! return provenance_records - - -> def timeit_direct(func, *args, **kwargs): -! start = datetime.now() -! result = func(*args, **kwargs) -! end = datetime.now() -! logger.info(f"TIMING: {func.__name__} took {(end - start).total_seconds()} seconds") -! return result - - -> def timeit(func): -! def wrapper(*args, **kwargs): -! return timeit_direct(func, *args, **kwargs) - -! return wrapper - - -> class LexiconMapper: -> def __init__(self): -> self._mappers = None - -> def map_value(self, value): -! value = value.strip() -! return self._make_lu_to_lexicon_mapper().get(value, value) - -> def _make_lu_to_lexicon_mapper(self): -! 
if self._mappers: -! return self._mappers - - # Lookup tables where CODE maps to MEANING -! lu_tables = [ -! "LU_AltitudeMethod", -! "LU_CollectionMethod", -! "LU_ConstructionMethod", -! "LU_CoordinateAccuracy", -! "LU_CoordinateMethod", -! "LU_CurrentUse", -! "LU_DataQuality", -! "LU_DataSource", -! "LU_Depth_CompletionSource", -! "LU_Discharge_ChemistrySource", -! "LU_LevelStatus", -! "LU_MajorAnalyte", -! "LU_MeasurementMethod", -! "LU_MinorTraceAnalyte", -! "LU_MonitoringStatus", -! "LU_SampleType", -! "LU_SiteType", -! "LU_Status", -! ] - - # Lookup tables intentionally skipped (kept for documentation only) - # Each entry explains why the table is excluded -! _lu_tables_skipped = { -! "LU_AltitudeDatum": "code is the value, so no need for mapping", -! "LU_CoordinateDatum": "code is the value, so no need for mapping", -! "LU_FieldNoteTypes": "not being used in the transfers since there are no records", -! "LU_Formations": "needs to be cleaned before it can be used", -! "LU_Lithology": "needs to be cleaned before it can be used", -! "LU_MeasuringAgency": "the abbreviation is what is used in the new schema", -! } -! mappers = {} - -! for lu_table in lu_tables: -! table = read_csv(lu_table) - -! for i, row in table.iterrows(): -! if lu_table == "LU_Formations": -! code = row.Code -! meaning = row.Meaning -! else: -! code = row.CODE -! meaning = row.MEANING - -! mappers.update({f"{lu_table}:{code}": meaning}) -! self._mappers = mappers -! 
return mappers - - -> lexicon_mapper = LexiconMapper() - - - # ============= EOF ============================================= From 027299060f94d38819f20412f931060ad6ec372d Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 3 Dec 2025 01:51:29 -0800 Subject: [PATCH 54/66] Delete .coverage --- .coverage | Bin 53248 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .coverage diff --git a/.coverage b/.coverage deleted file mode 100644 index 5417251745baf9f63193ac63e03d4d4b0edab5c4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 53248 zcmeI)&u$Y(90%}S+t^vh)&|kYiXxQhftrXU#_geMdw@`)3Y8WpX)l#9V|$!z@$Q=4 zH6(|sHB>@E>Kjyj1o{X(L!W>iIrY{{p;CXpon8M4HsVkeNWUxF**`P0GoSfq$JSiG zcG-!P==gyvBk_T;V3?+HUI@c5EP9sdnQjHz$)yMM*BslQwQCt058o-<-x#I*&xU=Y ze8pZb{Zf8!{`Zn=J)HlwAgvZ%zy<*bKmY>&Z-K$JQlYZ6WZwHelFhD)0@+f5el36c z)z;?rme}6BaCuAUbK-PS(6+uVHbvmy6+IP*j?-1b@!C#HMvk{5qFt4S?uV+)6CItS zsg46~#PNBr;j}4Mq;@EYUf{Sg*cZ3d{#qO$xm%?6B7K4qQO=IXA;jq<`l<+2M+M4j zsZi(Qw9{TKuHOHlSg4#hVLniCnPkwS$NWTT=)|ZfmpYVC%MaR|`(_}$)~*WIgdCQz zuE4uQ8ob z^XT_WqH}aFhzH;46smoD$ zC#o|g=Ni?q`kr&UuZESsCPp#W;$nFIn&3iE3ph2OuY7dEG^kaoz3_II-fuNzKl0<_ z233BePA{dOj?Wb;D=X%M+i^o-o<`G&!r_iJ(P)OJXEYtYKD*s;FPUs4+>5$h;dkOL z!?A^~9BR$ueWsRT$bdVV@RNX)(?e}-oLS0)Xm-4=Aca@OK z(DLu7K<=oGy)2Une%GS!PrZa2;jrwr6I9SRTD$C2}Ju7ZTNo^&63KLBz zt;s3FXnM(2GVBNGK=QIdGL11BCDT5VT%qEu7xII7 zgFZjA6Itdnqbz@}_Sf&oE{!HV=>*heX$;mgfZ%{_Nr7e4??-X_<-@pn^4v4ZuTEfe zoZ3^U)i^kH@2h?oIy9~Fv=x4wo)y(y8S*F;Rz=hIyGnX`ZlD;KLb|6DiZ~B1&PwM- zX@vWn#?GB;oTqeh;bDH8I$heoXvcM>*G}duo`!X*H1>1nDZ5=-kfp6^w0?oS#0wY&wC<7beHQrrUhT^-u1mMqRhNBU4T%;zsnJqKUP#Ln zg|&X3QT4FV8=00bZa0SG_<0uX=z1Rwx`H$&iRZg9%_FF*fd{l7P( zVv!37KmY;|fB*y_009U<00Izzz-to7npvZ)zyE)3*w0_nU63pUAOHafKmY;|fB*y_ z009U<00M_9kj<7Y{r&%A!+w0IS;5UA009U<00Izz00bZa0SG_<0uY!jaLzntSp&NM z_=)xR%Dc3)aMAeDSjZX1oI!^d+Bb~zOHcl(7RJ*MqIX;$=k)jg&kXz7cmV7}00Izz z00bZa0SG_<0uX=z1R!uk1adjciogFi7LG`XPz(q_00Izz00bZa0SG_<0uX=z1f~k` 
z_y73(Ka~b2ApijgKmY;|fB*y_009U<00Kuw0PFuBnW{uVAOHafKmY;|fB*y_009U< M00Pql;`RUk1r+UXZU6uP From f0e730c2f06f1b2abce75f43840f482ebbc8e4c8 Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 3 Dec 2025 01:53:56 -0800 Subject: [PATCH 55/66] Remove noisy EOF --- tests/test_thing.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test_thing.py b/tests/test_thing.py index 3792b4302..28290dada 100644 --- a/tests/test_thing.py +++ b/tests/test_thing.py @@ -1130,6 +1130,3 @@ def test_delete_thing_id_link_404_not_found(second_thing_id_link): assert response.status_code == 404 data = response.json() assert data["detail"] == f"ThingIdLink with ID {bad_id} not found." - - -# ============= EOF ============================================= From 070fcbae2dd849ba386f52a5201452634abdba03 Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 3 Dec 2025 01:58:00 -0800 Subject: [PATCH 56/66] Simplify error message --- tests/features/steps/post_migration_legacy_data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/features/steps/post_migration_legacy_data.py b/tests/features/steps/post_migration_legacy_data.py index 3baa7f5f3..185b1a758 100644 --- a/tests/features/steps/post_migration_legacy_data.py +++ b/tests/features/steps/post_migration_legacy_data.py @@ -401,8 +401,7 @@ def step_then_created_at_recent(context: Context): if created_at.tzinfo is None: raise AssertionError( "created_at is a naive datetime (no timezone info). " - "Ensure the database and ORM are configured to return timezone-aware datetimes in UTC. " - "AutoBaseMixin.created_at uses DateTime(timezone=True) with server_default=func.timezone('UTC', func.now())" + "Check ORM/database config for timezone-aware UTC datetimes (see AutoBaseMixin.created_at)." 
) diff_seconds = abs((now - created_at).total_seconds()) From f3e9587ad96bf88b01340f532a26a48a92347ec1 Mon Sep 17 00:00:00 2001 From: Kimball Bighorse Date: Wed, 3 Dec 2025 01:59:45 -0800 Subject: [PATCH 57/66] Remove unnecessary conditionals --- transfers/util.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/transfers/util.py b/transfers/util.py index 5216c204f..876e142fc 100644 --- a/transfers/util.py +++ b/transfers/util.py @@ -64,10 +64,7 @@ def read_csv(name: str, dtype: dict | None = None) -> pd.DataFrame: if local_file.exists(): logger.info(f"Reading {name} from local file: {local_file}") - if dtype: - return pd.read_csv(local_file, dtype=dtype) - else: - return pd.read_csv(local_file) + return pd.read_csv(local_file, dtype=dtype) # Check cache directory p = get_transfers_data_path(Path("nma_csv_cache") / f"{name}.csv") @@ -83,10 +80,7 @@ def read_csv(name: str, dtype: dict | None = None) -> pd.DataFrame: with open(p, "wb") as f: f.write(data) - if dtype: - return pd.read_csv(io.BytesIO(data), dtype=dtype) - else: - return pd.read_csv(io.BytesIO(data)) + return pd.read_csv(io.BytesIO(data), dtype=dtype) def get_valid_point_ids(session, thing_type="water well"): From 56694a3cc6c2ad88b4c3c52012b411e54e5ed4d3 Mon Sep 17 00:00:00 2001 From: jakeross Date: Wed, 3 Dec 2025 10:47:03 -0700 Subject: [PATCH 58/66] feat: update sensor type handling to support multiple sensor types in water levels transfer --- core/lexicon.json | 3 +++ transfers/waterlevels_transducer_transfer.py | 12 ++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/core/lexicon.json b/core/lexicon.json index 142f1745c..bec62b46e 100644 --- a/core/lexicon.json +++ b/core/lexicon.json @@ -676,6 +676,9 @@ {"categories": ["parameter_type"], "term": "Major Element", "definition": "Major Element"}, {"categories": ["parameter_type"], "term": "Minor Element", "definition": "Minor Element"}, {"categories": ["parameter_type"], "term": "Physical property", 
"definition": "Physical property"}, + + {"categories": ["sensor_type"], "term": "DiverLink", "definition": "DiverLink"}, + {"categories": ["sensor_type"], "term": "Diver Cable", "definition": "Diver Cable"}, {"categories": ["sensor_type"], "term": "Pressure Transducer", "definition": "Pressure Transducer"}, {"categories": ["sensor_type"], "term": "Data Logger", "definition": "Data Logger"}, {"categories": ["sensor_type"], "term": "Barometer", "definition": "Barometer"}, diff --git a/transfers/waterlevels_transducer_transfer.py b/transfers/waterlevels_transducer_transfer.py index 74eaafd06..cd323330c 100644 --- a/transfers/waterlevels_transducer_transfer.py +++ b/transfers/waterlevels_transducer_transfer.py @@ -33,13 +33,13 @@ class WaterLevelsContinuousTransferer(Transferer): _partition_field: str - _sensor_type: str + _sensor_types: tuple[str] def __init__(self, *args, **kw): super().__init__(*args, **kw) self.groundwater_parameter_id = get_groundwater_parameter_id() - if self._sensor_type is None: - raise ValueError("_sensor_type must be set") + if self._sensor_types is None: + raise ValueError("_sensor_types must be set") if self._partition_field is None: raise ValueError("_partition_field must be set") @@ -66,7 +66,7 @@ def _transfer_hook(self, session: Session) -> None: session.query(Deployment) .join(Thing) .join(Sensor) - .where(Sensor.sensor_type == self._sensor_type) + .where(Sensor.sensor_type.in_(self._sensor_types)) .where(Thing.name == pointid) .all() ) @@ -185,13 +185,13 @@ def _make_observation( class WaterLevelsContinuousPressureTransferer(WaterLevelsContinuousTransferer): source_table = "WaterLevelsContinuous_Pressure" _partition_field = "QCed" - _sensor_type = "Pressure Transducer" + _sensor_types = ("Pressure Transducer", "Barometer", "DiverLink", "Diver Cable") class WaterLevelsContinuousAcousticTransferer(WaterLevelsContinuousTransferer): source_table = "WaterLevelsContinuous_Acoustic" _partition_field = "PublicRelease" - _sensor_type = 
"Acoustic Sounder" + _sensor_types = ("Acoustic Sounder",) def _find_deployment(ts, deployments): From 1101e2e1b388ded28a8839f8a4f5c78031186598 Mon Sep 17 00:00:00 2001 From: jross Date: Wed, 3 Dec 2025 12:22:02 -0700 Subject: [PATCH 59/66] feat: refactor well transfer logic to use bulk save for improved performance and error handling --- transfers/transfer.py | 2 +- transfers/well_transfer.py | 38 ++++++++++++++++++++------------------ 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/transfers/transfer.py b/transfers/transfer.py index bf0c69b85..45a78cc60 100644 --- a/transfers/transfer.py +++ b/transfers/transfer.py @@ -57,7 +57,7 @@ def message(msg, pad=10, new_line_at_top=True): @timeit def transfer_all(metrics, limit=100): message("STARTING TRANSFER", new_line_at_top=False) - if int(os.environ.get("ERASE_AND_REBUILD", 0)): + if get_bool_env("ERASE_AND_REBUILD", False): logger.info("Erase and rebuilding database") erase_and_rebuild_db() diff --git a/transfers/well_transfer.py b/transfers/well_transfer.py index 314593250..754536e41 100644 --- a/transfers/well_transfer.py +++ b/transfers/well_transfer.py @@ -425,6 +425,9 @@ def _step(self, session: Session, df: pd.DataFrame, i: int, row: pd.Series): if well is not None: session.expunge(well) + if location is not None: + session.delete(location) + self._capture_error(row.PointID, str(e), "UnknownField") logger.critical(f"Error creating well for {row.PointID}: {e}") @@ -588,19 +591,19 @@ def _after_hook(self, session): query = session.query(Thing).filter(Thing.thing_type == "water well") count = query.count() for i, well in enumerate(query.all()): + objs = [] step_start_time = time.time() row = self.cleaned_df[self.cleaned_df["PointID"] == well.name].iloc[0] if notna(row.Notes): note = well.add_note(row.Notes, "Other") - session.add(note) + objs.append(note) location = well.current_location elevation_method = self._added_locations[row.PointID] data_provenances = make_location_data_provenance( row, 
location, elevation_method ) - for dp in data_provenances: - session.add(dp) + objs.extend(data_provenances) for row_field, kw in ( ( @@ -631,15 +634,9 @@ def _after_hook(self, session): ): if notna(row[row_field]): - try: - dp = DataProvenance( - target_id=well.id, target_table="thing", **kw - ) - session.add(dp) - session.commit() - except DatabaseError as e: - self._capture_error(row.PointID, str(e), "DataProvenance") - session.rollback() + dp = DataProvenance(target_id=well.id, target_table="thing", **kw) + objs.append(dp) + start_time = time.time() mphs = measuring_point_estimator.estimate_measuring_point_height(row) logger.info( @@ -654,7 +651,7 @@ def _after_hook(self, session): start_date=start_date, end_date=end_date, ) - session.add(measuring_point_history) + objs.append(measuring_point_history) """ Developer's notes @@ -686,7 +683,7 @@ def _after_hook(self, session): target_id=target_id, target_table=target_table, ) - session.add(status_history) + objs.append(status_history) logger.info( f" Added monitoring status for well {well.name}: {status_value}" ) @@ -700,7 +697,8 @@ def _after_hook(self, session): start_date=datetime.now(tz=UTC), end_date=None, ) - session.add(monitoring_frequency_history) + + objs.append(monitoring_frequency_history) logger.info( f" Adding '{monitoring_frequency}' monitoring frequency for well {well.name}" ) @@ -715,15 +713,19 @@ def _after_hook(self, session): target_id=target_id, target_table=target_table, ) - session.add(status_history) + objs.append(status_history) logger.info(f" Added well status for well {well.name}: {status_value}") + try: + session.bulk_save_objects(objs) + except DatabaseError as e: + session.rollback() + error_dict = e.orig.args[0] + self._capture_error(well.name, error_dict["D"], error_dict["t"]) logger.info( f"After hook: {well.name} {i+1}/{count} took {time.time() - step_start_time:.2f}s" ) - session.commit() - class WellChunkTransferer(ChunkTransferer): source_table: str = None From 
be3a11d604fff7720da62711d3a82b58c0c80cd0 Mon Sep 17 00:00:00 2001 From: jross Date: Wed, 3 Dec 2025 12:38:56 -0700 Subject: [PATCH 60/66] refactor: rename regex pattern for pump types and simplify extraction logic --- transfers/well_transfer.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/transfers/well_transfer.py b/transfers/well_transfer.py index 754536e41..d92f2ece6 100644 --- a/transfers/well_transfer.py +++ b/transfers/well_transfer.py @@ -125,26 +125,27 @@ def _extract_casing_materials(row) -> list[str]: return materials -pattern = re.compile( +PUMP_PATTERN = re.compile( r"\b(?Pjet|hand|submersible)\b|\b(?Pline[-\s]+shaft)\b", re.IGNORECASE ) def first_matched_term(text: str): - m = pattern.search(text) + m = PUMP_PATTERN.search(text) if not m: return None return m.group("term") or m.group("phrase") -PUMP_MAPPING = {"jet": "Jet", "hand": "Hand", "submersible": "Submersible"} - - def _extract_well_pump_type(row) -> str | None: if isna(row.ConstructionNotes): return None construction_notes = row.ConstructionNotes.lower() - return PUMP_MAPPING.get(first_matched_term(construction_notes), None) + pump = first_matched_term(construction_notes) + if pump: + return pump.capitalize() + else: + return None # Parse aquifer codes From 4b6d8f280f97e24152783dc96552e8e0cace752b Mon Sep 17 00:00:00 2001 From: jross Date: Wed, 3 Dec 2025 14:25:46 -0700 Subject: [PATCH 61/66] feat: add organization mapping functionality and update contact transfer logic --- core/lexicon.json | 76 +++++++++++++++++++++++ transfers/contact_transfer.py | 42 ++++++++++--- transfers/data/organization_mapping.json | 79 ++++++++++++++++++++++++ 3 files changed, 190 insertions(+), 7 deletions(-) create mode 100644 transfers/data/organization_mapping.json diff --git a/core/lexicon.json b/core/lexicon.json index bec62b46e..35f949802 100644 --- a/core/lexicon.json +++ b/core/lexicon.json @@ -373,6 +373,82 @@ {"categories": ["analysis_method_type"], "term": "Laboratory", 
"definition": "A procedure performed on a physical sample in a controlled, off-site laboratory environment. These methods typically involve complex instrumentation, standardized reagents, and formal quality control protocols."}, {"categories": ["analysis_method_type"], "term": "Field Procedure", "definition": "A standardized procedure performed on-site at the time of sample collection. This can involve direct measurement of the environmental medium using a calibrated field instrument or a specific, documented technique for collecting a sample."}, {"categories": ["analysis_method_type"], "term": "Calculation", "definition": "A mathematical procedure used to derive a new data point from one or more directly measured values. This type is used to document the provenance of calculated data, providing an auditable trail."}, + {"categories":["organization"],"term":"City of Aztec","definition":"City of Aztec"}, + {"categories":["organization"],"term":"Daybreak Investments","definition":"Daybreak Investments"}, + {"categories":["organization"],"term":"Vallecitos HOA","definition":"Vallecitos HOA"}, + {"categories":["organization"],"term":"Naiche Development","definition":"Naiche Corporation"}, + {"categories":["organization"],"term":"Santa Fe County; Santa Fe Animal Shelter","definition":"Santa Fe County; Santa Fe Animal Shelter"}, + {"categories":["organization"],"term":"El Guicu Ditch Association","definition":"El Guicu Ditch Association"}, + {"categories":["organization"],"term":"Santa Fe Municipal Airport","definition":"Santa Fe Municipal Airport"}, + {"categories":["organization"],"term":"Uluru Development","definition":"Uluru Development"}, + {"categories":["organization"],"term":"AllSup's Convenience Stores","definition":"AllSup's Convenience Stores"}, + {"categories":["organization"],"term":"Santa Fe Downs","definition":"Santa Fe Downs Resort"}, + {"categories":["organization"],"term":"City of Truth or Consequences, WWTP","definition":"City of Truth or Consequences, 
WWTP"}, + {"categories":["organization"],"term":"Riverbend Hotsprings","definition":"Riverbend Hotsprings"}, + {"categories":["organization"],"term":"Armendaris Ranch","definition":"Armendaris Ranch"}, + {"categories":["organization"],"term":"El Paso Water","definition":"El Paso Water"}, + {"categories":["organization"],"term":"BLM, Socorro Field Office","definition":"BLM, Socorro Field Office"}, + {"categories":["organization"],"term":"USFWS","definition":"US Fish & Wildlife Service"}, + {"categories":["organization"],"term":"NPS","definition":"National Park Service"}, + {"categories":["organization"],"term":"Sile MDWCA","definition":"Sile Municipal Domestic Water Assn."}, + {"categories":["organization"],"term":"Pena Blanca Water & Sanitation District","definition":"Pena Blanca Water & Sanitation District"}, + {"categories":["organization"],"term":"Town of Questa","definition":"Town of Questa"}, + {"categories":["organization"],"term":"Lamy MDWCA","definition":"Lama MDWCA"}, + {"categories":["organization"],"term":"Town of Cerro","definition":"Town of Cerro"}, + {"categories":["organization"],"term":"Farr Cattle Company","definition":"Farr Cattle Company (Farr Ranch)"}, + {"categories":["organization"],"term":"Carrizozo Orchard","definition":"Carrizozo Orchard"}, + {"categories":["organization"],"term":"USFS, Kiowa Grasslands","definition":"USFS, Kiowa Grasslands"}, + {"categories":["organization"],"term":"Cloud Country West Subdivision","definition":"Cloud Country West Subdivision"}, + {"categories":["organization"],"term":"Chama West Water Users Association","definition":"Chama West Water Users Assn."}, + {"categories":["organization"],"term":"El Rito Regional Water and Waste Water Association","definition":"El Rito Regional Water + Waste Water Association"}, + {"categories":["organization"],"term":"West Rim MDWUA","definition":"West Rim MDWUA"}, + {"categories":["organization"],"term":"Village of Willard","definition":"Village of Willard"}, + 
{"categories":["organization"],"term":"Quemado Municipal Water & SWA","definition":"Quemado Mutual Water and Sewage Works Association"}, + {"categories":["organization"],"term":"Coyote Creek MDWUA","definition":"Coyote Creek MDWUA"}, + {"categories":["organization"],"term":"Lamy MDWCA","definition":"Lamy Mutual Domestic Water Assn."}, + {"categories":["organization"],"term":"La Joya CWDA","definition":"La Joya CWDA"}, + {"categories":["organization"],"term":"NM Firefighters Training Academy","definition":"NM Firefighters Training Academy"}, + {"categories":["organization"],"term":"Cebolleta Land Grant","definition":"Cebolleta Land Grant"}, + {"categories":["organization"],"term":"Madrid Water Co-op","definition":"Madrid Water Co-op"}, + {"categories":["organization"],"term":"Sun Valley Water and Sanitation","definition":"Sun Valley Water and Sanitation"}, + {"categories":["organization"],"term":"Bluewater Lake MDWCA","definition":"Bluewater Lake MDWCA"}, + {"categories":["organization"],"term":"Bluewater Acres Domestic WUA","definition":"Bluewater Acres Domestic Water Users Assn."}, + {"categories":["organization"],"term":"Lybrook MDWCA","definition":"Lybrook Municipal"}, + {"categories":["organization"],"term":"New Mexico Museum of Natural History","definition":"New Mexico Museum of Natural History"}, + {"categories":["organization"],"term":"Hillsboro MDWCA","definition":"Hillsboro Mutual Domestic Water Consumer Assn."}, + {"categories":["organization"],"term":"Tyrone MDWCA","definition":"Tyrone Mutual Domestic Water Assn."}, + {"categories":["organization"],"term":"Santa Clara Water System","definition":"Santa Clara Water System"}, + {"categories":["organization"],"term":"Casas Adobes MDWCA","definition":"Casas Adobes Mutual Domestic"}, + {"categories":["organization"],"term":"Lake Roberts WUA","definition":"Lake Roberts Water Assn."}, + {"categories":["organization"],"term":"El Creston MDWCA","definition":"El Creston MDWCA"}, + 
{"categories":["organization"],"term":"Reserve Municipality Water Works","definition":"Reserve Municipality Water Works"}, + {"categories":["organization"],"term":"Bayard","definition":"Bayard Municipal Water"}, + {"categories":["organization"],"term":"Town of Estancia","definition":"Town of Estancia"}, + {"categories":["organization"],"term":"Pie Town MDWCA","definition":"Pie Town MDWCA"}, + {"categories":["organization"],"term":"Roosevelt SWCD","definition":"Roosevelt Soil & Water Conservation District"}, + {"categories":["organization"],"term":"Otis MDWCA","definition":"Otis Mutual Domestic"}, + {"categories":["organization"],"term":"White Cliffs MDWUA","definition":"White Cliffs MDWUA"}, + {"categories":["organization"],"term":"Vista Linda Water Co-op","definition":"Vista Linda Water Co-op"}, + {"categories":["organization"],"term":"Anasazi Trails Water Co-op","definition":"Anasazi Trails Water Cooperative"}, + {"categories":["organization"],"term":"Canon MDWCA","definition":"Canon Mutual Domestic Water Consumer Assn."}, + {"categories":["organization"],"term":"Placitas Trails Water Co-op","definition":"Placitas Trails Water Coop"}, + {"categories":["organization"],"term":"BLM, Roswell Office","definition":"BLM, Roswell Office"}, + {"categories":["organization"],"term":"Forked Lightning Ranch","definition":"Forked Lightning Ranch"}, + {"categories":["organization"],"term":"Cottonwood RWA","definition":"Cottonwood Rural Water Assn."}, + {"categories":["organization"],"term":"Pinon Ridge WUA","definition":"Pinon Ridge Water Users Association"}, + {"categories":["organization"],"term":"McSherry Farms","definition":"McSherry Farms"}, + {"categories":["organization"],"term":"Agua Sana WUA","definition":"Agua Sana Water Users Assn."}, + {"categories":["organization"],"term":"Chamita MDWCA","definition":"Chamita Water Users Association"}, + {"categories":["organization"],"term":"W Spear-bar Ranch","definition":"W Spear-bar Ranch"}, + 
{"categories":["organization"],"term":"Village of Capitan","definition":"Village of Capitan"}, + {"categories":["organization"],"term":"Brazos MDWCA","definition":"Brazos Mutual Domestic Water Consumers Assn."}, + {"categories":["organization"],"term":"Alto Alps HOA","definition":"Alto Alps Homeowners Association"}, + {"categories":["organization"],"term":"Chiricahua Desert Museum","definition":"Chiricahua Desert Museum"}, + {"categories":["organization"],"term":"Bike Ranch","definition":"Bike Ranch"}, + {"categories":["organization"],"term":"Hachita MDWCA","definition":"Hachita MDWCA"}, + {"categories":["organization"],"term":"Carrizozo Municipal Water","definition":"Carrizozo Municipal Water"}, + {"categories":["organization"],"term":"Dunhill Ranch","definition":"Dunhill Ranch"}, + {"categories":["organization"],"term":"Santa Fe Conservation Trust","definition":"Santa Fe Conservation Trust"}, {"categories": ["organization"], "term": "NMSU", "definition": "New Mexico State University"}, {"categories": ["organization"], "term": "USGS", "definition": "US Geological Survey"}, {"categories": ["organization"], "term": "TWDB", "definition": "Texas Water Development Board"}, diff --git a/transfers/contact_transfer.py b/transfers/contact_transfer.py index a1d545a03..1c690e0ce 100644 --- a/transfers/contact_transfer.py +++ b/transfers/contact_transfer.py @@ -20,6 +20,7 @@ from pydantic import ValidationError from sqlalchemy.orm import Session +from core.enums import Organization from db import ( Contact, ThingContactAssociation, @@ -48,6 +49,10 @@ def __init__(self, *args, **kw): with open(co_to_org_mapper_path, "r") as f: self._co_to_org_mapper = json.load(f) + organization_mapper_path = get_transfers_data_path("organization_mapping.json") + with open(organization_mapper_path, "r") as f: + self._organization_mapper = json.load(f) + self._added = [] def _get_dfs(self): @@ -74,7 +79,14 @@ def _group_step(self, session: Session, row: pd.Series, db_item: Base): "second", ): 
try: - if adder(session, row, db_item, self._co_to_org_mapper, self._added): + if adder( + session, + row, + db_item, + self._co_to_org_mapper, + self._organization_mapper, + self._added, + ): session.commit() logger.info(f"added {tag} contact for PointID {row.PointID}") except ValidationError as e: @@ -90,7 +102,7 @@ def _group_step(self, session: Session, row: pd.Series, db_item: Base): self._capture_error(row.PointID, str(e), "UnknownError") -def _add_first_contact(session, row, thing, co_to_org_mapper, added): +def _add_first_contact(session, row, thing, co_to_org_mapper, org_mapper, added): # TODO: extract role from OwnerComment # role = extract_owner_role(row.OwnerComment) role = "Owner" @@ -98,10 +110,10 @@ def _add_first_contact(session, row, thing, co_to_org_mapper, added): name = _make_name(row.FirstName, row.LastName) - organization = co_to_org_mapper.get(row.Company, row.Company) - + # check if organization is in lexicon + organization = _get_organization(row, co_to_org_mapper, org_mapper) if (name, organization) in added: - return + return None added.append((name, organization)) contact_data = { @@ -190,7 +202,22 @@ def _add_first_contact(session, row, thing, co_to_org_mapper, added): return True -def _add_second_contact(session, row, thing, co_to_org_mapper, added): +def _get_organization(row, co_to_org_mapper, org_mapper): + organization = co_to_org_mapper.get(row.Company, row.Company) + + try: + Organization(organization) + except ValueError: + norganization = next( + (k for k, v in org_mapper.items() if v == organization), None + ) + logger.warning(f"mapping {organization} to {norganization}") + organization = norganization + + return organization + + +def _add_second_contact(session, row, thing, co_to_org_mapper, org_mapper, added): if all( [ getattr(row, f"Second{f}") is None @@ -203,9 +230,10 @@ def _add_second_contact(session, row, thing, co_to_org_mapper, added): release_status = "private" name = _make_name(row.SecondFirstName, 
row.SecondLastName) - organization = co_to_org_mapper.get(row.Company, row.Company) + organization = _get_organization(row, co_to_org_mapper, org_mapper) if (name, organization) in added: return + added.append((name, organization)) contact_data = { diff --git a/transfers/data/organization_mapping.json b/transfers/data/organization_mapping.json new file mode 100644 index 000000000..0d3bda9dc --- /dev/null +++ b/transfers/data/organization_mapping.json @@ -0,0 +1,79 @@ +{ + "City of Aztec": "City of Aztec", + "Daybreak Investments": "Daybreak Investments", + "Vallecitos HOA": "Vallecitos HOA", + "Naiche Development": "Naiche Corporation", + "Santa Fe County; Santa Fe Animal Shelter": "Santa Fe County; Santa Fe Animal Shelter", + "El Guicu Ditch Association": "El Guicu Ditch Association", + "Santa Fe Municipal Airport": "Santa Fe Municipal Airport", + "Uluru Development": "Uluru Development", + "AllSup's Convenience Stores": "AllSup's Convenience Stores", + "Santa Fe Downs": "Santa Fe Downs Resort", + "City of Truth or Consequences, WWTP": "City of Truth or Consequences, WWTP", + "Riverbend Hotsprings": "Riverbend Hotsprings", + "Armendaris Ranch": "Armendaris Ranch", + "El Paso Water": "El Paso Water", + "PVACD": "Pecos Valley Artesian Conservancy District", + "BLM, Socorro Field Office": "BLM, Socorro Field Office", + "USFWS": "US Fish & Wildlife Service", + "NPS": "National Park Service", + "Sile MDWCA": "Sile Municipal Domestic Water Assn.", + "Pena Blanca Water & Sanitation District": "Pena Blanca Water & Sanitation District", + "Town of Questa": "Town of Questa", + "Lamy MDWCA": "Lama MDWCA", + "Town of Cerro": "Town of Cerro", + "Farr Cattle Company": "Farr Cattle Company (Farr Ranch)", + "Carrizozo Orchard": "Carrizozo Orchard", + "USFS, Kiowa Grasslands": "USFS, Kiowa Grasslands", + "Cloud Country West Subdivision": "Cloud Country West Subdivision", + "Chama West Water Users Association": "Chama West Water Users Assn.", + "El Rito Regional Water and Waste 
Water Association": "El Rito Regional Water + Waste Water Association", + "West Rim MDWUA": "West Rim MDWUA", + "Village of Willard": "Village of Willard", + "Quemado Municipal Water & SWA": "Quemado Mutual Water and Sewage Works Association", + "Coyote Creek MDWUA": "Coyote Creek MDWUA", + "Lamy Mutual Domestic Water Assn.": "Lamy Mutual Domestic Water Assn.", + "La Joya CWDA": "La Joya CWDA", + "NM Firefighters Training Academy": "NM Firefighters Training Academy", + "Cebolleta Land Grant": "Cebolleta Land Grant", + "Madrid Water Co-op": "Madrid Water Co-op", + "Sun Valley Water and Sanitation": "Sun Valley Water and Sanitation", + "Bluewater Lake MDWCA": "Bluewater Lake MDWCA", + "Bluewater Acres Domestic WUA": "Bluewater Acres Domestic Water Users Assn.", + "Lybrook MDWCA": "Lybrook Municipal", + "New Mexico Museum of Natural History": "New Mexico Museum of Natural History", + "Hillsboro MDWCA": "Hillsboro Mutual Domestic Water Consumer Assn.", + "Tyrone MDWCA": "Tyrone Mutual Domestic Water Assn.", + "Santa Clara Water System": "Santa Clara Water System", + "Casas Adobes MDWCA": "Casas Adobes Mutual Domestic", + "Lake Roberts WUA": "Lake Roberts Water Assn.", + "El Creston MDWCA": "El Creston MDWCA", + "Reserve Municipality Water Works": "Reserve Municipality Water Works", + "Bayard": "Bayard Municipal Water", + "Town of Estancia": "Town of Estancia", + "Pie Town MDWCA": "Pie Town MDWCA", + "Roosevelt SWCD": "Roosevelt Soil & Water Conservation District", + "Otis MDWCA": "Otis Mutual Domestic", + "White Cliffs MDWUA": "White Cliffs MDWUA", + "Vista Linda Water Co-op": "Vista Linda Water Co-op", + "Anasazi Trails Water Co-op": "Anasazi Trails Water Cooperative", + "Canon MDWCA": "Canon Mutual Domestic Water Consumer Assn.", + "Placitas Trails Water Co-op": "Placitas Trails Water Coop", + "BLM, Roswell Office": "BLM, Roswell Office", + "Forked Lightning Ranch": "Forked Lightning Ranch", + "Cottonwood RWA": "Cottonwood Rural Water Assn.", + "Pinon Ridge WUA": 
"Pinon Ridge Water Users Association", + "McSherry Farms": "McSherry Farms", + "Agua Sana WUA": "Agua Sana Water Users Assn.", + "Chamita MDWCA": "Chamita Water Users Association", + "W Spear-bar Ranch": "W Spear-bar Ranch", + "Village of Capitan": "Village of Capitan", + "Brazos MDWCA": "Brazos Mutual Domestic Water Consumers Assn.", + "Alto Alps HOA": "Alto Alps Homeowners Association", + "Chiricahua Desert Museum": "Chiricahua Desert Museum", + "Bike Ranch": "Bike Ranch", + "Hachita MDWCA": "Hachita MDWCA", + "Carrizozo Municipal Water": "Carrizozo Municipal Water", + "Dunhill Ranch": "Dunhill Ranch", + "Santa Fe Conservation Trust": "Santa Fe Conservation Trust" +} From 1bb06ba0e0ad7301a2a8e4ef27ede37e21253e70 Mon Sep 17 00:00:00 2001 From: jross Date: Wed, 3 Dec 2025 14:35:55 -0700 Subject: [PATCH 62/66] feat: add organization mapping functionality and update contact transfer logic --- core/lexicon.json | 152 +++++++++++++++++++++++----------------------- 1 file changed, 76 insertions(+), 76 deletions(-) diff --git a/core/lexicon.json b/core/lexicon.json index 35f949802..8fca294be 100644 --- a/core/lexicon.json +++ b/core/lexicon.json @@ -373,82 +373,82 @@ {"categories": ["analysis_method_type"], "term": "Laboratory", "definition": "A procedure performed on a physical sample in a controlled, off-site laboratory environment. These methods typically involve complex instrumentation, standardized reagents, and formal quality control protocols."}, {"categories": ["analysis_method_type"], "term": "Field Procedure", "definition": "A standardized procedure performed on-site at the time of sample collection. This can involve direct measurement of the environmental medium using a calibrated field instrument or a specific, documented technique for collecting a sample."}, {"categories": ["analysis_method_type"], "term": "Calculation", "definition": "A mathematical procedure used to derive a new data point from one or more directly measured values. 
This type is used to document the provenance of calculated data, providing an auditable trail."}, - {"categories":["organization"],"term":"City of Aztec","definition":"City of Aztec"}, - {"categories":["organization"],"term":"Daybreak Investments","definition":"Daybreak Investments"}, - {"categories":["organization"],"term":"Vallecitos HOA","definition":"Vallecitos HOA"}, - {"categories":["organization"],"term":"Naiche Development","definition":"Naiche Corporation"}, - {"categories":["organization"],"term":"Santa Fe County; Santa Fe Animal Shelter","definition":"Santa Fe County; Santa Fe Animal Shelter"}, - {"categories":["organization"],"term":"El Guicu Ditch Association","definition":"El Guicu Ditch Association"}, - {"categories":["organization"],"term":"Santa Fe Municipal Airport","definition":"Santa Fe Municipal Airport"}, - {"categories":["organization"],"term":"Uluru Development","definition":"Uluru Development"}, - {"categories":["organization"],"term":"AllSup's Convenience Stores","definition":"AllSup's Convenience Stores"}, - {"categories":["organization"],"term":"Santa Fe Downs","definition":"Santa Fe Downs Resort"}, - {"categories":["organization"],"term":"City of Truth or Consequences, WWTP","definition":"City of Truth or Consequences, WWTP"}, - {"categories":["organization"],"term":"Riverbend Hotsprings","definition":"Riverbend Hotsprings"}, - {"categories":["organization"],"term":"Armendaris Ranch","definition":"Armendaris Ranch"}, - {"categories":["organization"],"term":"El Paso Water","definition":"El Paso Water"}, - {"categories":["organization"],"term":"BLM, Socorro Field Office","definition":"BLM, Socorro Field Office"}, - {"categories":["organization"],"term":"USFWS","definition":"US Fish & Wildlife Service"}, - {"categories":["organization"],"term":"NPS","definition":"National Park Service"}, - {"categories":["organization"],"term":"Sile MDWCA","definition":"Sile Municipal Domestic Water Assn."}, - {"categories":["organization"],"term":"Pena 
Blanca Water & Sanitation District","definition":"Pena Blanca Water & Sanitation District"}, - {"categories":["organization"],"term":"Town of Questa","definition":"Town of Questa"}, - {"categories":["organization"],"term":"Lamy MDWCA","definition":"Lama MDWCA"}, - {"categories":["organization"],"term":"Town of Cerro","definition":"Town of Cerro"}, - {"categories":["organization"],"term":"Farr Cattle Company","definition":"Farr Cattle Company (Farr Ranch)"}, - {"categories":["organization"],"term":"Carrizozo Orchard","definition":"Carrizozo Orchard"}, - {"categories":["organization"],"term":"USFS, Kiowa Grasslands","definition":"USFS, Kiowa Grasslands"}, - {"categories":["organization"],"term":"Cloud Country West Subdivision","definition":"Cloud Country West Subdivision"}, - {"categories":["organization"],"term":"Chama West Water Users Association","definition":"Chama West Water Users Assn."}, - {"categories":["organization"],"term":"El Rito Regional Water and Waste Water Association","definition":"El Rito Regional Water + Waste Water Association"}, - {"categories":["organization"],"term":"West Rim MDWUA","definition":"West Rim MDWUA"}, - {"categories":["organization"],"term":"Village of Willard","definition":"Village of Willard"}, - {"categories":["organization"],"term":"Quemado Municipal Water & SWA","definition":"Quemado Mutual Water and Sewage Works Association"}, - {"categories":["organization"],"term":"Coyote Creek MDWUA","definition":"Coyote Creek MDWUA"}, - {"categories":["organization"],"term":"Lamy MDWCA","definition":"Lamy Mutual Domestic Water Assn."}, - {"categories":["organization"],"term":"La Joya CWDA","definition":"La Joya CWDA"}, - {"categories":["organization"],"term":"NM Firefighters Training Academy","definition":"NM Firefighters Training Academy"}, - {"categories":["organization"],"term":"Cebolleta Land Grant","definition":"Cebolleta Land Grant"}, - {"categories":["organization"],"term":"Madrid Water Co-op","definition":"Madrid Water Co-op"}, - 
{"categories":["organization"],"term":"Sun Valley Water and Sanitation","definition":"Sun Valley Water and Sanitation"}, - {"categories":["organization"],"term":"Bluewater Lake MDWCA","definition":"Bluewater Lake MDWCA"}, - {"categories":["organization"],"term":"Bluewater Acres Domestic WUA","definition":"Bluewater Acres Domestic Water Users Assn."}, - {"categories":["organization"],"term":"Lybrook MDWCA","definition":"Lybrook Municipal"}, - {"categories":["organization"],"term":"New Mexico Museum of Natural History","definition":"New Mexico Museum of Natural History"}, - {"categories":["organization"],"term":"Hillsboro MDWCA","definition":"Hillsboro Mutual Domestic Water Consumer Assn."}, - {"categories":["organization"],"term":"Tyrone MDWCA","definition":"Tyrone Mutual Domestic Water Assn."}, - {"categories":["organization"],"term":"Santa Clara Water System","definition":"Santa Clara Water System"}, - {"categories":["organization"],"term":"Casas Adobes MDWCA","definition":"Casas Adobes Mutual Domestic"}, - {"categories":["organization"],"term":"Lake Roberts WUA","definition":"Lake Roberts Water Assn."}, - {"categories":["organization"],"term":"El Creston MDWCA","definition":"El Creston MDWCA"}, - {"categories":["organization"],"term":"Reserve Municipality Water Works","definition":"Reserve Municipality Water Works"}, - {"categories":["organization"],"term":"Bayard","definition":"Bayard Municipal Water"}, - {"categories":["organization"],"term":"Town of Estancia","definition":"Town of Estancia"}, - {"categories":["organization"],"term":"Pie Town MDWCA","definition":"Pie Town MDWCA"}, - {"categories":["organization"],"term":"Roosevelt SWCD","definition":"Roosevelt Soil & Water Conservation District"}, - {"categories":["organization"],"term":"Otis MDWCA","definition":"Otis Mutual Domestic"}, - {"categories":["organization"],"term":"White Cliffs MDWUA","definition":"White Cliffs MDWUA"}, - {"categories":["organization"],"term":"Vista Linda Water 
Co-op","definition":"Vista Linda Water Co-op"}, - {"categories":["organization"],"term":"Anasazi Trails Water Co-op","definition":"Anasazi Trails Water Cooperative"}, - {"categories":["organization"],"term":"Canon MDWCA","definition":"Canon Mutual Domestic Water Consumer Assn."}, - {"categories":["organization"],"term":"Placitas Trails Water Co-op","definition":"Placitas Trails Water Coop"}, - {"categories":["organization"],"term":"BLM, Roswell Office","definition":"BLM, Roswell Office"}, - {"categories":["organization"],"term":"Forked Lightning Ranch","definition":"Forked Lightning Ranch"}, - {"categories":["organization"],"term":"Cottonwood RWA","definition":"Cottonwood Rural Water Assn."}, - {"categories":["organization"],"term":"Pinon Ridge WUA","definition":"Pinon Ridge Water Users Association"}, - {"categories":["organization"],"term":"McSherry Farms","definition":"McSherry Farms"}, - {"categories":["organization"],"term":"Agua Sana WUA","definition":"Agua Sana Water Users Assn."}, - {"categories":["organization"],"term":"Chamita MDWCA","definition":"Chamita Water Users Association"}, - {"categories":["organization"],"term":"W Spear-bar Ranch","definition":"W Spear-bar Ranch"}, - {"categories":["organization"],"term":"Village of Capitan","definition":"Village of Capitan"}, - {"categories":["organization"],"term":"Brazos MDWCA","definition":"Brazos Mutual Domestic Water Consumers Assn."}, - {"categories":["organization"],"term":"Alto Alps HOA","definition":"Alto Alps Homeowners Association"}, - {"categories":["organization"],"term":"Chiricahua Desert Museum","definition":"Chiricahua Desert Museum"}, - {"categories":["organization"],"term":"Bike Ranch","definition":"Bike Ranch"}, - {"categories":["organization"],"term":"Hachita MDWCA","definition":"Hachita MDWCA"}, - {"categories":["organization"],"term":"Carrizozo Municipal Water","definition":"Carrizozo Municipal Water"}, - {"categories":["organization"],"term":"Dunhill Ranch","definition":"Dunhill Ranch"}, - 
{"categories":["organization"],"term":"Santa Fe Conservation Trust","definition":"Santa Fe Conservation Trust"}, + {"categories": ["organization"], "term": "City of Aztec", "definition": "City of Aztec"}, + {"categories": ["organization"], "term": "Daybreak Investments", "definition": "Daybreak Investments"}, + {"categories": ["organization"], "term": "Vallecitos HOA", "definition": "Vallecitos HOA"}, + {"categories": ["organization"], "term": "Naiche Development", "definition": "Naiche Corporation"}, + {"categories": ["organization"], "term": "Santa Fe County; Santa Fe Animal Shelter", "definition": "Santa Fe County; Santa Fe Animal Shelter"}, + {"categories": ["organization"], "term": "El Guicu Ditch Association", "definition": "El Guicu Ditch Association"}, + {"categories": ["organization"], "term": "Santa Fe Municipal Airport", "definition": "Santa Fe Municipal Airport"}, + {"categories": ["organization"], "term": "Uluru Development", "definition": "Uluru Development"}, + {"categories": ["organization"], "term": "AllSup's Convenience Stores", "definition": "AllSup's Convenience Stores"}, + {"categories": ["organization"], "term": "Santa Fe Downs", "definition": "Santa Fe Downs Resort"}, + {"categories": ["organization"], "term": "City of Truth or Consequences, WWTP", "definition": "City of Truth or Consequences, WWTP"}, + {"categories": ["organization"], "term": "Riverbend Hotsprings", "definition": "Riverbend Hotsprings"}, + {"categories": ["organization"], "term": "Armendaris Ranch", "definition": "Armendaris Ranch"}, + {"categories": ["organization"], "term": "El Paso Water", "definition": "El Paso Water"}, + {"categories": ["organization"], "term": "BLM, Socorro Field Office", "definition": "BLM, Socorro Field Office"}, + {"categories": ["organization"], "term": "USFWS", "definition": "US Fish & Wildlife Service"}, + {"categories": ["organization"], "term": "NPS", "definition": "National Park Service"}, + {"categories": ["organization"], "term": "Sile 
MDWCA", "definition": "Sile Municipal Domestic Water Assn."}, + {"categories": ["organization"], "term": "Pena Blanca Water & Sanitation District", "definition": "Pena Blanca Water & Sanitation District"}, + {"categories": ["organization"], "term": "Town of Questa", "definition": "Town of Questa"}, + {"categories": ["organization"], "term": "Lamy MDWCA", "definition": "Lama MDWCA"}, + {"categories": ["organization"], "term": "Town of Cerro", "definition": "Town of Cerro"}, + {"categories": ["organization"], "term": "Farr Cattle Company", "definition": "Farr Cattle Company (Farr Ranch)"}, + {"categories": ["organization"], "term": "Carrizozo Orchard", "definition": "Carrizozo Orchard"}, + {"categories": ["organization"], "term": "USFS, Kiowa Grasslands", "definition": "USFS, Kiowa Grasslands"}, + {"categories": ["organization"], "term": "Cloud Country West Subdivision", "definition": "Cloud Country West Subdivision"}, + {"categories": ["organization"], "term": "Chama West Water Users Association", "definition": "Chama West Water Users Assn."}, + {"categories": ["organization"], "term": "El Rito Regional Water and Waste Water Association", "definition": "El Rito Regional Water + Waste Water Association"}, + {"categories": ["organization"], "term": "West Rim MDWUA", "definition": "West Rim MDWUA"}, + {"categories": ["organization"], "term": "Village of Willard", "definition": "Village of Willard"}, + {"categories": ["organization"], "term": "Quemado Municipal Water & SWA", "definition": "Quemado Mutual Water and Sewage Works Association"}, + {"categories": ["organization"], "term": "Coyote Creek MDWUA", "definition": "Coyote Creek MDWUA"}, + {"categories": ["organization"], "term": "Lamy MDWCA", "definition": "Lamy Mutual Domestic Water Assn."}, + {"categories": ["organization"], "term": "La Joya CWDA", "definition": "La Joya CWDA"}, + {"categories": ["organization"], "term": "NM Firefighters Training Academy", "definition": "NM Firefighters Training Academy"}, + 
{"categories": ["organization"], "term": "Cebolleta Land Grant", "definition": "Cebolleta Land Grant"}, + {"categories": ["organization"], "term": "Madrid Water Co-op", "definition": "Madrid Water Co-op"}, + {"categories": ["organization"], "term": "Sun Valley Water and Sanitation", "definition": "Sun Valley Water and Sanitation"}, + {"categories": ["organization"], "term": "Bluewater Lake MDWCA", "definition": "Bluewater Lake MDWCA"}, + {"categories": ["organization"], "term": "Bluewater Acres Domestic WUA", "definition": "Bluewater Acres Domestic Water Users Assn."}, + {"categories": ["organization"], "term": "Lybrook MDWCA", "definition": "Lybrook Municipal"}, + {"categories": ["organization"], "term": "New Mexico Museum of Natural History", "definition": "New Mexico Museum of Natural History"}, + {"categories": ["organization"], "term": "Hillsboro MDWCA", "definition": "Hillsboro Mutual Domestic Water Consumer Assn."}, + {"categories": ["organization"], "term": "Tyrone MDWCA", "definition": "Tyrone Mutual Domestic Water Assn."}, + {"categories": ["organization"], "term": "Santa Clara Water System", "definition": "Santa Clara Water System"}, + {"categories": ["organization"], "term": "Casas Adobes MDWCA", "definition": "Casas Adobes Mutual Domestic"}, + {"categories": ["organization"], "term": "Lake Roberts WUA", "definition": "Lake Roberts Water Assn."}, + {"categories": ["organization"], "term": "El Creston MDWCA", "definition": "El Creston MDWCA"}, + {"categories": ["organization"], "term": "Reserve Municipality Water Works", "definition": "Reserve Municipality Water Works"}, + {"categories": ["organization"], "term": "Bayard", "definition": "Bayard Municipal Water"}, + {"categories": ["organization"], "term": "Town of Estancia", "definition": "Town of Estancia"}, + {"categories": ["organization"], "term": "Pie Town MDWCA", "definition": "Pie Town MDWCA"}, + {"categories": ["organization"], "term": "Roosevelt SWCD", "definition": "Roosevelt Soil & Water 
Conservation District"}, + {"categories": ["organization"], "term": "Otis MDWCA", "definition": "Otis Mutual Domestic"}, + {"categories": ["organization"], "term": "White Cliffs MDWUA", "definition": "White Cliffs MDWUA"}, + {"categories": ["organization"], "term": "Vista Linda Water Co-op", "definition": "Vista Linda Water Co-op"}, + {"categories": ["organization"], "term": "Anasazi Trails Water Co-op", "definition": "Anasazi Trails Water Cooperative"}, + {"categories": ["organization"], "term": "Canon MDWCA", "definition": "Canon Mutual Domestic Water Consumer Assn."}, + {"categories": ["organization"], "term": "Placitas Trails Water Co-op", "definition": "Placitas Trails Water Coop"}, + {"categories": ["organization"], "term": "BLM, Roswell Office", "definition": "BLM, Roswell Office"}, + {"categories": ["organization"], "term": "Forked Lightning Ranch", "definition": "Forked Lightning Ranch"}, + {"categories": ["organization"], "term": "Cottonwood RWA", "definition": "Cottonwood Rural Water Assn."}, + {"categories": ["organization"], "term": "Pinon Ridge WUA", "definition": "Pinon Ridge Water Users Association"}, + {"categories": ["organization"], "term": "McSherry Farms", "definition": "McSherry Farms"}, + {"categories": ["organization"], "term": "Agua Sana WUA", "definition": "Agua Sana Water Users Assn."}, + {"categories": ["organization"], "term": "Chamita MDWCA", "definition": "Chamita Water Users Association"}, + {"categories": ["organization"], "term": "W Spear-bar Ranch", "definition": "W Spear-bar Ranch"}, + {"categories": ["organization"], "term": "Village of Capitan", "definition": "Village of Capitan"}, + {"categories": ["organization"], "term": "Brazos MDWCA", "definition": "Brazos Mutual Domestic Water Consumers Assn."}, + {"categories": ["organization"], "term": "Alto Alps HOA", "definition": "Alto Alps Homeowners Association"}, + {"categories": ["organization"], "term": "Chiricahua Desert Museum", "definition": "Chiricahua Desert Museum"}, + 
{"categories": ["organization"], "term": "Bike Ranch", "definition": "Bike Ranch"}, + {"categories": ["organization"], "term": "Hachita MDWCA", "definition": "Hachita MDWCA"}, + {"categories": ["organization"], "term": "Carrizozo Municipal Water", "definition": "Carrizozo Municipal Water"}, + {"categories": ["organization"], "term": "Dunhill Ranch", "definition": "Dunhill Ranch"}, + {"categories": ["organization"], "term": "Santa Fe Conservation Trust", "definition": "Santa Fe Conservation Trust"}, {"categories": ["organization"], "term": "NMSU", "definition": "New Mexico State University"}, {"categories": ["organization"], "term": "USGS", "definition": "US Geological Survey"}, {"categories": ["organization"], "term": "TWDB", "definition": "Texas Water Development Board"}, From 650203243ebe20a0933c7793844879942c338a98 Mon Sep 17 00:00:00 2001 From: jross Date: Wed, 3 Dec 2025 15:11:42 -0700 Subject: [PATCH 63/66] feat: add DiverLink and Diver Cable to sensor mapping --- transfers/sensor_transfer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/transfers/sensor_transfer.py b/transfers/sensor_transfer.py index 76f9f4fe9..2f4ce7cf3 100644 --- a/transfers/sensor_transfer.py +++ b/transfers/sensor_transfer.py @@ -33,6 +33,8 @@ "Pressure transducer": "Pressure Transducer", "Acoustic sounder": "Acoustic Sounder", "Barometer": "Barometer", + "DiverLink": "DiverLink", + "Diver Cable": "Diver Cable", } From d8a16784299121229e42329993602530dae17e9a Mon Sep 17 00:00:00 2001 From: jross Date: Wed, 3 Dec 2025 15:14:56 -0700 Subject: [PATCH 64/66] feat: remove Farr Cattle Company from organization lexicon --- core/lexicon.json | 1 - 1 file changed, 1 deletion(-) diff --git a/core/lexicon.json b/core/lexicon.json index 8fca294be..f7a4f381f 100644 --- a/core/lexicon.json +++ b/core/lexicon.json @@ -551,7 +551,6 @@ {"categories": ["organization"], "term": "El Rito Canyon MDWCA", "definition": "El Rito Canyon MDWCA"}, {"categories": ["organization"], "term": "Encantado 
Enterprises", "definition": "Encantado Enterprises"}, {"categories": ["organization"], "term": "Estrella Concepts LLC", "definition": "Estrella Concepts LLC"}, - {"categories": ["organization"], "term": "Farr Cattle Company", "definition": "Farr Cattle Company"}, {"categories": ["organization"], "term": "Sixteen Springs Fire Department", "definition": "Sixteen Springs Fire Department"}, {"categories": ["organization"], "term": "Fire Water Lodge", "definition": "Fire Water Lodge"}, {"categories": ["organization"], "term": "Ford County Land & Cattle Company, Inc", "definition": "Ford County Land & Cattle Company, Inc"}, From b2674058240ff82e90467bca4a5a04717549c6a0 Mon Sep 17 00:00:00 2001 From: jross Date: Wed, 3 Dec 2025 15:17:21 -0700 Subject: [PATCH 65/66] feat: remove Lamy MDWCA from organization lexicon --- core/lexicon.json | 1 - 1 file changed, 1 deletion(-) diff --git a/core/lexicon.json b/core/lexicon.json index f7a4f381f..e5048eb49 100644 --- a/core/lexicon.json +++ b/core/lexicon.json @@ -564,7 +564,6 @@ {"categories": ["organization"], "term": "K. Schmitt Trust", "definition": "K. 
Schmitt Trust"}, {"categories": ["organization"], "term": "La Cienega MDWCA", "definition": "La Cienega MDWCA"}, {"categories": ["organization"], "term": "La Vista HOA", "definition": "La Vista HOA"}, - {"categories": ["organization"], "term": "Lamy MDWCA", "definition": "Lamy MDWCA"}, {"categories": ["organization"], "term": "Land Ventures LLC", "definition": "Land Ventures LLC"}, {"categories": ["organization"], "term": "Las Lagunitas", "definition": "Las Lagunitas"}, {"categories": ["organization"], "term": "Las Lagunitas HOA", "definition": "Las Lagunitas HOA"}, From 147db27258975c5c6cd158f750b291800bd49cc2 Mon Sep 17 00:00:00 2001 From: jross Date: Wed, 3 Dec 2025 15:21:35 -0700 Subject: [PATCH 66/66] feat: remove Santa Fe Downs from organization lexicon --- core/lexicon.json | 1 - 1 file changed, 1 deletion(-) diff --git a/core/lexicon.json b/core/lexicon.json index e5048eb49..423be5332 100644 --- a/core/lexicon.json +++ b/core/lexicon.json @@ -605,7 +605,6 @@ {"categories": ["organization"], "term": "Sangre de Cristo Center", "definition": "Sangre de Cristo Center"}, {"categories": ["organization"], "term": "Valle Vista Water Utility", "definition": "Valle Vista Water Utility"}, {"categories": ["organization"], "term": "Santa Fe County, Valle Vista Water Utility, Inc.", "definition": "Santa Fe County, Valle Vista Water Utility, Inc."}, - {"categories": ["organization"], "term": "Santa Fe Downs", "definition": "Santa Fe Downs"}, {"categories": ["organization"], "term": "Santa Fe Horse Park", "definition": "Santa Fe Horse Park"}, {"categories": ["organization"], "term": "Santa Fe Opera", "definition": "Santa Fe Opera"}, {"categories": ["organization"], "term": "Santa Fe Waldorf School", "definition": "Santa Fe Waldorf School"},