From 50054e43aea7534549e2dae28db641cb003b4d29 Mon Sep 17 00:00:00 2001 From: Yolanda Robla Date: Fri, 7 Feb 2025 16:24:01 +0100 Subject: [PATCH 1/3] feat: remove duplicated alerts Sometimes the different client tools generate multiple requests when the user requests a single task. This produces what looks like a duplicate alert, although it is not strictly a duplicate because each alert belongs to a different request. These near-duplicates provide little value to the user, so deduplicate those alerts based on the code snippet and the details of the alert. Also remove the dogecoin regex, as it is giving false positives. Closes: #875 --- signatures.yaml | 1 - src/codegate/api/v1.py | 1 + src/codegate/api/v1_processing.py | 38 +++++++++++++++++++++++++++++-- 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/signatures.yaml b/signatures.yaml index 881c3bd3..ccab615a 100644 --- a/signatures.yaml +++ b/signatures.yaml @@ -281,7 +281,6 @@ - Bitcoin SegWit: \b(bc1)[a-zA-HJ-NP-Z0-9]{39,59}\b - Ethereum: \b0x[a-fA-F0-9]{40}\b - Litecoin: \b(L|M)[a-km-zA-HJ-NP-Z1-9]{26,33}\b - - Dogecoin: \b(D|A)[a-km-zA-HJ-NP-Z1-9]{25,34}\b - Ripple: \br[rK][a-zA-Z0-9]{25,35}\b - Monero: \b4[0-9AB][1-9A-HJ-NP-Za-km-z]{93}\b - Tron: \bT[a-zA-HJ-NP-Z0-9]{33}\b diff --git a/src/codegate/api/v1.py b/src/codegate/api/v1.py index 3b837a8b..9ba9c93a 100644 --- a/src/codegate/api/v1.py +++ b/src/codegate/api/v1.py @@ -390,6 +390,7 @@ async def get_workspace_alerts(workspace_name: str) -> List[Optional[v1_models.A try: alerts = await dbreader.get_alerts_by_workspace(ws.id, AlertSeverity.CRITICAL.value) + alerts = v1_processing.remove_duplicate_alerts(alerts) prompts_outputs = await dbreader.get_prompts_with_output(ws.id) return await v1_processing.parse_get_alert_conversation(alerts, prompts_outputs) except Exception: diff --git a/src/codegate/api/v1_processing.py b/src/codegate/api/v1_processing.py index c72e06ba..9ce3a17e 100644 --- a/src/codegate/api/v1_processing.py +++ 
b/src/codegate/api/v1_processing.py @@ -62,7 +62,7 @@ async def _is_system_prompt(message: str) -> bool: return False -async def parse_request(request_str: str) -> Tuple[Optional[List[str]], str]: +async def parse_request(request_str: str) -> Tuple[Optional[List[str]], str]: # noqa: C901 """ Parse the request string from the pipeline and return the message and the model. """ @@ -105,7 +105,7 @@ async def parse_request(request_str: str) -> Tuple[Optional[List[str]], str]: return messages, model -async def parse_output(output_str: str) -> Optional[str]: +async def parse_output(output_str: str) -> Optional[str]: # noqa: C901 """ Parse the output string from the pipeline and return the message. """ @@ -499,3 +499,37 @@ async def parse_workspace_token_usage( for p_qa in partial_question_answers: token_usage_agg.add_model_token_usage(p_qa.model_token_usage) return token_usage_agg + + +def remove_duplicate_alerts(alerts): + unique_alerts = [] + seen = defaultdict(list) + + for alert in sorted( + alerts, key=lambda x: x.timestamp, reverse=True + ): # Sort alerts by timestamp descending + if alert.trigger_type != "codegate-secrets": + unique_alerts.append(alert) + continue + + # Extract trigger string content until "Context" + trigger_string_content = alert.trigger_string.split("Context")[0] + + key = ( + alert.code_snippet, + alert.trigger_type, + alert.trigger_category, + trigger_string_content, + ) + + # Alerts arrive newest-first: drop an older alert within 5s of the tracked one + if key in seen: + existing_alert = seen[key] + if abs((alert.timestamp - existing_alert.timestamp).total_seconds()) < 5: + seen[key] = alert # Track the older alert so a burst keeps chaining + continue + + seen[key] = alert + unique_alerts.append(alert) + + return unique_alerts From bf01c293c418e9b531c1b7b9443983a05c50d9f1 Mon Sep 17 00:00:00 2001 From: Alejandro Ponce Date: Wed, 12 Feb 2025 16:02:34 +0200 Subject: [PATCH 2/3] Moved function to remove duplicate alerts to v1_processing --- src/codegate/api/v1.py | 1 - 
src/codegate/api/v1_processing.py | 8 +++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/codegate/api/v1.py b/src/codegate/api/v1.py index 9ba9c93a..3b837a8b 100644 --- a/src/codegate/api/v1.py +++ b/src/codegate/api/v1.py @@ -390,7 +390,6 @@ async def get_workspace_alerts(workspace_name: str) -> List[Optional[v1_models.A try: alerts = await dbreader.get_alerts_by_workspace(ws.id, AlertSeverity.CRITICAL.value) - alerts = v1_processing.remove_duplicate_alerts(alerts) prompts_outputs = await dbreader.get_prompts_with_output(ws.id) return await v1_processing.parse_get_alert_conversation(alerts, prompts_outputs) except Exception: diff --git a/src/codegate/api/v1_processing.py b/src/codegate/api/v1_processing.py index 9ce3a17e..fc902d59 100644 --- a/src/codegate/api/v1_processing.py +++ b/src/codegate/api/v1_processing.py @@ -392,7 +392,8 @@ async def match_conversations( qa = _get_question_answer_from_partial(selected_partial_qa) qa.question.message = parse_question_answer(qa.question.message) questions_answers.append(qa) - alerts.extend(selected_partial_qa.alerts) + deduped_alerts = await remove_duplicate_alerts(selected_partial_qa.alerts) + alerts.extend(deduped_alerts) token_usage_agg.add_model_token_usage(selected_partial_qa.model_token_usage) # only add conversation if we have some answers @@ -480,10 +481,11 @@ async def parse_get_alert_conversation( The rows contain the raw request and output strings from the pipeline. 
""" _, map_q_id_to_conversation = await parse_messages_in_conversations(prompts_outputs) + dedup_alerts = await remove_duplicate_alerts(alerts) async with asyncio.TaskGroup() as tg: tasks = [ tg.create_task(parse_row_alert_conversation(row, map_q_id_to_conversation)) - for row in alerts + for row in dedup_alerts ] return [task.result() for task in tasks if task.result() is not None] @@ -501,7 +503,7 @@ async def parse_workspace_token_usage( return token_usage_agg -def remove_duplicate_alerts(alerts): +async def remove_duplicate_alerts(alerts: List[v1_models.Alert]) -> List[v1_models.Alert]: unique_alerts = [] seen = defaultdict(list) From ada3ed72236e9a22b8f91e99417f70de16f7c5a8 Mon Sep 17 00:00:00 2001 From: Alejandro Ponce Date: Wed, 12 Feb 2025 16:21:56 +0200 Subject: [PATCH 3/3] Restore dogecoin --- signatures.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/signatures.yaml b/signatures.yaml index ccab615a..881c3bd3 100644 --- a/signatures.yaml +++ b/signatures.yaml @@ -281,6 +281,7 @@ - Bitcoin SegWit: \b(bc1)[a-zA-HJ-NP-Z0-9]{39,59}\b - Ethereum: \b0x[a-fA-F0-9]{40}\b - Litecoin: \b(L|M)[a-km-zA-HJ-NP-Z1-9]{26,33}\b + - Dogecoin: \b(D|A)[a-km-zA-HJ-NP-Z1-9]{25,34}\b - Ripple: \br[rK][a-zA-Z0-9]{25,35}\b - Monero: \b4[0-9AB][1-9A-HJ-NP-Za-km-z]{93}\b - Tron: \bT[a-zA-HJ-NP-Z0-9]{33}\b