From f6c492dca47d3e57d940b8065fcb74def95cea3f Mon Sep 17 00:00:00 2001 From: Anders Starcke Henriksen Date: Wed, 2 Aug 2023 12:58:56 +0200 Subject: [PATCH 1/4] Use filtering queries to do batched AI quering. --- .../auto-model-codeml-queries.ts | 75 ++++++++++++- .../data-extensions-editor/auto-model-v2.ts | 57 ++++++++++ .../data-extensions-editor-view.ts | 44 +++++++- .../auto-model-v2.test.ts | 106 ++++++++++++++++++ .../auto-model-codeml-queries.test.ts | 48 +++++++- 5 files changed, 321 insertions(+), 9 deletions(-) diff --git a/extensions/ql-vscode/src/data-extensions-editor/auto-model-codeml-queries.ts b/extensions/ql-vscode/src/data-extensions-editor/auto-model-codeml-queries.ts index ca42902bab1..9e6909e66c7 100644 --- a/extensions/ql-vscode/src/data-extensions-editor/auto-model-codeml-queries.ts +++ b/extensions/ql-vscode/src/data-extensions-editor/auto-model-codeml-queries.ts @@ -17,6 +17,10 @@ import { redactableError } from "../common/errors"; import { interpretResultsSarif } from "../query-results"; import { join } from "path"; import { assertNever } from "../common/helpers-pure"; +import { dir } from "tmp-promise"; +import { writeFile, outputFile } from "fs-extra"; +import { dump as dumpYaml } from "js-yaml"; +import { MethodSignature } from "./external-api-usage"; type AutoModelQueryOptions = { queryTag: string; @@ -26,6 +30,7 @@ type AutoModelQueryOptions = { databaseItem: DatabaseItem; qlpack: QlPacksForLanguage; sourceInfo: SourceInfo | undefined; + additionalPacks: string[]; extensionPacks: string[]; queryStorageDir: string; @@ -52,6 +57,7 @@ async function runAutoModelQuery({ databaseItem, qlpack, sourceInfo, + additionalPacks, extensionPacks, queryStorageDir, progress, @@ -99,7 +105,7 @@ async function runAutoModelQuery({ quickEvalCountOnly: false, }, false, - getOnDiskWorkspaceFolders(), + additionalPacks, extensionPacks, queryStorageDir, undefined, @@ -147,6 +153,7 @@ async function runAutoModelQuery({ type AutoModelQueriesOptions = { mode: 
Mode; + candidateMethods: MethodSignature[]; cliServer: CodeQLCliServer; queryRunner: QueryRunner; databaseItem: DatabaseItem; @@ -161,6 +168,7 @@ export type AutoModelQueriesResult = { export async function runAutoModelQueries({ mode, + candidateMethods, cliServer, queryRunner, databaseItem, @@ -189,7 +197,13 @@ export async function runAutoModelQueries({ sourceLocationPrefix, }; - const additionalPacks = getOnDiskWorkspaceFolders(); + // Generate a pack containing the candidate filters + const filterPackDir = await generateCandidateFilterPack( + databaseItem.language, + candidateMethods, + ); + + const additionalPacks = [...getOnDiskWorkspaceFolders(), filterPackDir]; const extensionPacks = Object.keys( await cliServer.resolveQlpacks(additionalPacks, true), ); @@ -208,6 +222,7 @@ export async function runAutoModelQueries({ databaseItem, qlpack, sourceInfo, + additionalPacks, extensionPacks, queryStorageDir, progress: (update) => { @@ -228,3 +243,59 @@ export async function runAutoModelQueries({ candidates, }; } + +/** + * generateCandidateFilterPack will create a temporary extension pack. + * This pack will contain a filter that will restrict the automodel queries + * to the specified candidate methods only. + * This is done using the `extensible` predicate "automodelCandidateFilter". + * @param language + * @param candidateMethods + * @returns + */ +export async function generateCandidateFilterPack( + language: string, + candidateMethods: MethodSignature[], +): Promise { + // Pack resides in a temporary directory, to not pollute the workspace. 
+ const packDir = (await dir({ unsafeCleanup: true })).path; + + const syntheticConfigPack = { + name: "codeql/automodel-filter", + version: "0.0.0", + library: true, + extensionTargets: { + [`codeql/${language}-all`]: "*", + }, + dataExtensions: ["filter.yml"], + }; + + const qlpackFile = join(packDir, "codeql-pack.yml"); + await outputFile(qlpackFile, dumpYaml(syntheticConfigPack), "utf8"); + + // The predicate has the following definition: + // extensible predicate automodelCandidateFilter(string package, string type, string name, string signature) + const dataRows = candidateMethods.map((method) => [ + method.packageName, + method.typeName, + method.methodName, + method.methodParameters, + ]); + + const filter = { + extensions: [ + { + addsTo: { + pack: `codeql/${language}-queries`, + extensible: "automodelCandidateFilter", + }, + data: dataRows, + }, + ], + }; + + const filterFile = join(packDir, "filter.yml"); + await writeFile(filterFile, dumpYaml(filter), "utf8"); + + return packDir; +} diff --git a/extensions/ql-vscode/src/data-extensions-editor/auto-model-v2.ts b/extensions/ql-vscode/src/data-extensions-editor/auto-model-v2.ts index c3522acc9d4..1dd09c2ca34 100644 --- a/extensions/ql-vscode/src/data-extensions-editor/auto-model-v2.ts +++ b/extensions/ql-vscode/src/data-extensions-editor/auto-model-v2.ts @@ -4,6 +4,63 @@ import { AutoModelQueriesResult } from "./auto-model-codeml-queries"; import { assertNever } from "../common/helpers-pure"; import * as Sarif from "sarif"; import { gzipEncode } from "../common/zlib"; +import { ExternalApiUsage, MethodSignature } from "./external-api-usage"; +import { ModeledMethod } from "./modeled-method"; +import { groupMethods, sortGroupNames, sortMethods } from "./shared/sorting"; + +// Soft limit on the number of candidates to send to the model. +// Note that the model may return fewer than this number of candidates. +const candidateLimit = 20; +/** + * Return the candidates that the model should be run on.
This includes limiting the number of + * candidates to the candidate limit and filtering out anything that is already modeled and respecting + * the order in the UI. + * @param mode Whether it is application or framework mode. + * @param externalApiUsages all external API usages. + * @param modeledMethods the currently modeled methods. + * @returns list of method signatures that are candidates for modeling, i.e. not yet modeled and not supported. + */ +export function getCandidates( + mode: Mode, + externalApiUsages: ExternalApiUsage[], + modeledMethods: Record, +): MethodSignature[] { + // Sort the same way as the UI so we send the first ones listed in the UI first + const grouped = groupMethods(externalApiUsages, mode); + const sortedGroupNames = sortGroupNames(grouped); + const sortedExternalApiUsages = sortedGroupNames.flatMap((name) => + sortMethods(grouped[name]), + ); + + const candidates: MethodSignature[] = []; + + for (const externalApiUsage of sortedExternalApiUsages) { + const modeledMethod: ModeledMethod = modeledMethods[ + externalApiUsage.signature + ] ?? { + type: "none", + }; + + // If we have reached the max number of candidates then stop + if (candidates.length >= candidateLimit) { + break; + } + + // Anything that is modeled is not a candidate + if (modeledMethod.type !== "none") { + continue; + } + + // A method that is supported is modeled outside of the model file, so it is not a candidate.
+ if (externalApiUsage.supported) { + continue; + } + + // The rest are candidates + candidates.push(externalApiUsage); + } + return candidates; +} /** * Encode a SARIF log to the format expected by the server: JSON, GZIP-compressed, base64-encoded diff --git a/extensions/ql-vscode/src/data-extensions-editor/data-extensions-editor-view.ts b/extensions/ql-vscode/src/data-extensions-editor/data-extensions-editor-view.ts index 888893c94ca..17afe729f87 100644 --- a/extensions/ql-vscode/src/data-extensions-editor/data-extensions-editor-view.ts +++ b/extensions/ql-vscode/src/data-extensions-editor/data-extensions-editor-view.ts @@ -56,9 +56,10 @@ import { join } from "path"; import { pickExtensionPack } from "./extension-pack-picker"; import { getLanguageDisplayName } from "../common/query-language"; import { runAutoModelQueries } from "./auto-model-codeml-queries"; -import { createAutoModelV2Request } from "./auto-model-v2"; +import { createAutoModelV2Request, getCandidates } from "./auto-model-v2"; import { load as loadYaml } from "js-yaml"; import { loadDataExtensionYaml } from "./yaml"; +import { extLogger } from "../common/logging/vscode"; export class DataExtensionsEditorView extends AbstractWebview< ToDataExtensionsEditorMessage, @@ -380,8 +381,22 @@ export class DataExtensionsEditorView extends AbstractWebview< let predictedModeledMethods: Record; if (useLlmGenerationV2()) { + // Fetch the candidates to send to the model + const candidateMethods = getCandidates( + this.mode, + externalApiUsages, + modeledMethods, + ); + + // If there are no candidates, there is nothing to model and we just return + if (candidateMethods.length === 0) { + void extLogger.log("No candidates to model. 
Stopping."); + return; + } + const usages = await runAutoModelQueries({ mode: this.mode, + candidateMethods, cliServer: this.cliServer, queryRunner: this.queryRunner, queryStorageDir: this.queryStorageDir, @@ -421,12 +436,33 @@ export class DataExtensionsEditorView extends AbstractWebview< filename: "auto-model.yml", }); - const modeledMethods = loadDataExtensionYaml(models); - if (!modeledMethods) { + const loadedMethods = loadDataExtensionYaml(models); + if (!loadedMethods) { return; } - predictedModeledMethods = modeledMethods; + // Any candidate that was not part of the response is a negative result + // meaning that the candidate is not a sink for the kinds that the LLM is checking for. + // For now we model this as a sink neutral method, however this is subject + // to discussion. + for (const candidate of candidateMethods) { + if (!(candidate.signature in loadedMethods)) { + loadedMethods[candidate.signature] = { + type: "neutral", + kind: "sink", + input: "", + output: "", + provenance: "ai-generated", + signature: candidate.signature, + packageName: candidate.packageName, + typeName: candidate.typeName, + methodName: candidate.methodName, + methodParameters: candidate.methodParameters, + }; + } + } + + predictedModeledMethods = loadedMethods; } else { const usages = await getAutoModelUsages({ cliServer: this.cliServer, diff --git a/extensions/ql-vscode/test/unit-tests/data-extensions-editor/auto-model-v2.test.ts b/extensions/ql-vscode/test/unit-tests/data-extensions-editor/auto-model-v2.test.ts index 545dbeeff97..acd0388b706 100644 --- a/extensions/ql-vscode/test/unit-tests/data-extensions-editor/auto-model-v2.test.ts +++ b/extensions/ql-vscode/test/unit-tests/data-extensions-editor/auto-model-v2.test.ts @@ -1,12 +1,15 @@ import { createAutoModelV2Request, encodeSarif, + getCandidates, } from "../../../src/data-extensions-editor/auto-model-v2"; import { Mode } from "../../../src/data-extensions-editor/shared/mode"; import { AutomodelMode } from
"../../../src/data-extensions-editor/auto-model-api-v2"; import { AutoModelQueriesResult } from "../../../src/data-extensions-editor/auto-model-codeml-queries"; import * as sarif from "sarif"; import { gzipDecode } from "../../../src/common/zlib"; +import { ExternalApiUsage } from "../../../src/data-extensions-editor/external-api-usage"; +import { ModeledMethod } from "../../../src/data-extensions-editor/modeled-method"; describe("createAutoModelV2Request", () => { const createSarifLog = (queryId: string): sarif.Log => { @@ -80,3 +83,106 @@ describe("createAutoModelV2Request", () => { expect(parsed).toEqual(result.candidates); }); }); + +describe("getCandidates", () => { + it("doesnt return methods that are already modelled", () => { + const externalApiUsages: ExternalApiUsage[] = []; + externalApiUsages.push({ + library: "my.jar", + signature: "org.my.A#x()", + packageName: "org.my", + typeName: "A", + methodName: "x", + methodParameters: "()", + supported: false, + supportedType: "none", + usages: [], + }); + const modeledMethods: Record = { + "org.my.A#x()": { + type: "neutral", + kind: "", + input: "", + output: "", + provenance: "manual", + signature: "org.my.A#x()", + packageName: "org.my", + typeName: "A", + methodName: "x", + methodParameters: "()", + }, + }; + const candidates = getCandidates( + Mode.Application, + externalApiUsages, + modeledMethods, + ); + expect(candidates.length).toEqual(0); + }); + it("doesnt return methods that are supported from other sources", () => { + const externalApiUsages: ExternalApiUsage[] = []; + externalApiUsages.push({ + library: "my.jar", + signature: "org.my.A#x()", + packageName: "org.my", + typeName: "A", + methodName: "x", + methodParameters: "()", + supported: true, + supportedType: "none", + usages: [], + }); + const modeledMethods = {}; + const candidates = getCandidates( + Mode.Application, + externalApiUsages, + modeledMethods, + ); + expect(candidates.length).toEqual(0); + }); + it("return methods that neither 
modeled nor supported from other sources", () => { + const externalApiUsages: ExternalApiUsage[] = []; + externalApiUsages.push({ + library: "my.jar", + signature: "org.my.A#x()", + packageName: "org.my", + typeName: "A", + methodName: "x", + methodParameters: "()", + supported: false, + supportedType: "none", + usages: [], + }); + const modeledMethods = {}; + const candidates = getCandidates( + Mode.Application, + externalApiUsages, + modeledMethods, + ); + expect(candidates.length).toEqual(1); + }); + it("respects the limit", () => { + const externalApiUsages: ExternalApiUsage[] = []; + for (let i = 0; i < 30; i++) { + externalApiUsages.push({ + library: "my.jar", + signature: `org.my.A#x${i}()`, + + packageName: "org.my", + typeName: "A", + methodName: `x${i}`, + methodParameters: "()", + supported: false, + supportedType: "none", + usages: [], + }); + } + const modeledMethods = {}; + const candidates = getCandidates( + Mode.Application, + externalApiUsages, + modeledMethods, + ); + expect(candidates.length).toEqual(20); + }); +}); diff --git a/extensions/ql-vscode/test/vscode-tests/no-workspace/data-extensions-editor/auto-model-codeml-queries.test.ts b/extensions/ql-vscode/test/vscode-tests/no-workspace/data-extensions-editor/auto-model-codeml-queries.test.ts index 6c6270dd6f6..8b11aa6f9e9 100644 --- a/extensions/ql-vscode/test/vscode-tests/no-workspace/data-extensions-editor/auto-model-codeml-queries.test.ts +++ b/extensions/ql-vscode/test/vscode-tests/no-workspace/data-extensions-editor/auto-model-codeml-queries.test.ts @@ -5,13 +5,20 @@ import { } from "../../../../src/databases/local-databases"; import { file } from "tmp-promise"; import { QueryResultType } from "../../../../src/query-server/new-messages"; -import { runAutoModelQueries } from "../../../../src/data-extensions-editor/auto-model-codeml-queries"; +import { + generateCandidateFilterPack, + runAutoModelQueries, +} from "../../../../src/data-extensions-editor/auto-model-codeml-queries"; import { 
Mode } from "../../../../src/data-extensions-editor/shared/mode"; import { mockedObject, mockedUri } from "../../utils/mocking.helpers"; import { CodeQLCliServer } from "../../../../src/codeql-cli/cli"; import { QueryRunner } from "../../../../src/query-server"; import * as queryResolver from "../../../../src/local-queries/query-resolver"; import * as standardQueries from "../../../../src/local-queries/standard-queries"; +import { MethodSignature } from "../../../../src/data-extensions-editor/external-api-usage"; +import { join } from "path"; +import { exists, readFile } from "fs-extra"; +import { load as loadYaml } from "js-yaml"; describe("runAutoModelQueries", () => { const qlpack = { @@ -60,6 +67,7 @@ describe("runAutoModelQueries", () => { const options = { mode: Mode.Application, + candidateMethods: [], cliServer: mockedObject({ resolveQlpacks: jest.fn().mockResolvedValue({ "/a/b/c/my-extension-pack": {}, @@ -140,7 +148,10 @@ describe("runAutoModelQueries", () => { expect(result).not.toBeUndefined(); expect(options.cliServer.resolveQlpacks).toHaveBeenCalledTimes(1); - expect(options.cliServer.resolveQlpacks).toHaveBeenCalledWith([], true); + expect(options.cliServer.resolveQlpacks).toHaveBeenCalledWith( + expect.arrayContaining([expect.stringContaining("tmp")]), + true, + ); expect(resolveQueriesSpy).toHaveBeenCalledTimes(1); expect(resolveQueriesSpy).toHaveBeenCalledWith( options.cliServer, @@ -165,7 +176,7 @@ describe("runAutoModelQueries", () => { quickEvalCountOnly: false, }, false, - [], + expect.arrayContaining([expect.stringContaining("tmp")]), ["/a/b/c/my-extension-pack"], "/tmp/queries", undefined, @@ -173,3 +184,34 @@ describe("runAutoModelQueries", () => { ); }); }); + +describe("generateCandidateFilterPack", () => { + it("should create a temp pack containing the candidate filters", async () => { + const candidateMethods: MethodSignature[] = [ + { + signature: "org.my.A#x()", + packageName: "org.my", + typeName: "A", + methodName: "x", + 
methodParameters: "()", + }, + ]; + const packDir = await generateCandidateFilterPack("java", candidateMethods); + expect(packDir).not.toBeUndefined(); + const qlpackFile = join(packDir, "codeql-pack.yml"); + expect(await exists(qlpackFile)).toBe(true); + const filterFile = join(packDir, "filter.yml"); + expect(await exists(filterFile)).toBe(true); + // Read the contents of filterFile and parse as yaml + const yaml = await loadYaml(await readFile(filterFile, "utf8")); + const extensions = yaml.extensions; + expect(extensions).toBeInstanceOf(Array); + expect(extensions).toHaveLength(1); + const extension = extensions[0]; + expect(extension.addsTo.pack).toEqual("codeql/java-queries"); + expect(extension.addsTo.extensible).toEqual("automodelCandidateFilter"); + expect(extension.data).toBeInstanceOf(Array); + expect(extension.data).toHaveLength(1); + expect(extension.data[0]).toEqual(["org.my", "A", "x", "()"]); + }); +}); From 12abf816231be117000e8c2904ae7dfdec4baa80 Mon Sep 17 00:00:00 2001 From: Anders Starcke Henriksen Date: Fri, 4 Aug 2023 15:35:12 +0200 Subject: [PATCH 2/4] Apply suggestions from code review Co-authored-by: Charis Kyriakou --- .../data-extensions-editor/auto-model-v2.test.ts | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/extensions/ql-vscode/test/unit-tests/data-extensions-editor/auto-model-v2.test.ts b/extensions/ql-vscode/test/unit-tests/data-extensions-editor/auto-model-v2.test.ts index acd0388b706..5db12c07fe2 100644 --- a/extensions/ql-vscode/test/unit-tests/data-extensions-editor/auto-model-v2.test.ts +++ b/extensions/ql-vscode/test/unit-tests/data-extensions-editor/auto-model-v2.test.ts @@ -85,7 +85,7 @@ describe("createAutoModelV2Request", () => { }); describe("getCandidates", () => { - it("doesnt return methods that are already modelled", () => { + it("doesn't return methods that are already modelled", () => { const externalApiUsages: ExternalApiUsage[] = []; externalApiUsages.push({ library: "my.jar", @@ 
-119,7 +119,8 @@ describe("getCandidates", () => { ); expect(candidates.length).toEqual(0); }); - it("doesnt return methods that are supported from other sources", () => { + + it("doesn't return methods that are supported from other sources", () => { const externalApiUsages: ExternalApiUsage[] = []; externalApiUsages.push({ library: "my.jar", @@ -140,7 +141,8 @@ describe("getCandidates", () => { ); expect(candidates.length).toEqual(0); }); - it("return methods that neither modeled nor supported from other sources", () => { + + it("returns methods that are neither modeled nor supported from other sources", () => { const externalApiUsages: ExternalApiUsage[] = []; externalApiUsages.push({ library: "my.jar", @@ -161,6 +163,7 @@ describe("getCandidates", () => { ); expect(candidates.length).toEqual(1); }); + it("respects the limit", () => { const externalApiUsages: ExternalApiUsage[] = []; for (let i = 0; i < 30; i++) { From d4137b2c43414d7c67890212a421490fdc772428 Mon Sep 17 00:00:00 2001 From: Anders Starcke Henriksen Date: Fri, 4 Aug 2023 15:59:25 +0200 Subject: [PATCH 3/4] Address comments. 
--- .../auto-model-v2.test.ts | 51 ++++++++++--------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/extensions/ql-vscode/test/unit-tests/data-extensions-editor/auto-model-v2.test.ts b/extensions/ql-vscode/test/unit-tests/data-extensions-editor/auto-model-v2.test.ts index 5db12c07fe2..cb87822390c 100644 --- a/extensions/ql-vscode/test/unit-tests/data-extensions-editor/auto-model-v2.test.ts +++ b/extensions/ql-vscode/test/unit-tests/data-extensions-editor/auto-model-v2.test.ts @@ -86,18 +86,19 @@ describe("createAutoModelV2Request", () => { describe("getCandidates", () => { it("doesn't return methods that are already modelled", () => { - const externalApiUsages: ExternalApiUsage[] = []; - externalApiUsages.push({ - library: "my.jar", - signature: "org.my.A#x()", - packageName: "org.my", - typeName: "A", - methodName: "x", - methodParameters: "()", - supported: false, - supportedType: "none", - usages: [], - }); + const externalApiUsages: ExternalApiUsage[] = [ + { + library: "my.jar", + signature: "org.my.A#x()", + packageName: "org.my", + typeName: "A", + methodName: "x", + methodParameters: "()", + supported: false, + supportedType: "none", + usages: [], + }, + ]; const modeledMethods: Record = { "org.my.A#x()": { type: "neutral", @@ -121,18 +122,19 @@ describe("getCandidates", () => { }); it("doesn't return methods that are supported from other sources", () => { - const externalApiUsages: ExternalApiUsage[] = []; - externalApiUsages.push({ - library: "my.jar", - signature: "org.my.A#x()", - packageName: "org.my", - typeName: "A", - methodName: "x", - methodParameters: "()", - supported: true, - supportedType: "none", - usages: [], - }); + const externalApiUsages: ExternalApiUsage[] = [ + { + library: "my.jar", + signature: "org.my.A#x()", + packageName: "org.my", + typeName: "A", + methodName: "x", + methodParameters: "()", + supported: true, + supportedType: "none", + usages: [], + }, + ]; const modeledMethods = {}; const candidates = 
getCandidates( Mode.Application, @@ -170,7 +172,6 @@ describe("getCandidates", () => { externalApiUsages.push({ library: "my.jar", signature: `org.my.A#x${i}()`, - packageName: "org.my", typeName: "A", methodName: `x${i}`, From 9bd228666017851c4bbf7a2786a2393f1cb23263 Mon Sep 17 00:00:00 2001 From: Anders Starcke Henriksen Date: Mon, 7 Aug 2023 10:21:12 +0200 Subject: [PATCH 4/4] Update extensions/ql-vscode/src/data-extensions-editor/auto-model-codeml-queries.ts Co-authored-by: Aditya Sharad <6874315+adityasharad@users.noreply.github.com> --- .../src/data-extensions-editor/auto-model-codeml-queries.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extensions/ql-vscode/src/data-extensions-editor/auto-model-codeml-queries.ts b/extensions/ql-vscode/src/data-extensions-editor/auto-model-codeml-queries.ts index 9e6909e66c7..8bd2e104744 100644 --- a/extensions/ql-vscode/src/data-extensions-editor/auto-model-codeml-queries.ts +++ b/extensions/ql-vscode/src/data-extensions-editor/auto-model-codeml-queries.ts @@ -265,7 +265,7 @@ export async function generateCandidateFilterPack( version: "0.0.0", library: true, extensionTargets: { - [`codeql/${language}-all`]: "*", + [`codeql/${language}-queries`]: "*", }, dataExtensions: ["filter.yml"], };