diff --git a/.github/ISSUE_TEMPLATE/data-extension-create.yml b/.github/ISSUE_TEMPLATE/data-extension-create.yml new file mode 100644 index 0000000..f74522b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/data-extension-create.yml @@ -0,0 +1,142 @@ +name: Request new CodeQL Data Extension +description: Request a new CodeQL data extension (models-as-data) for an unmodeled library or framework +title: "[Data Extension Create]: " +labels: ["data-extension-create", "enhancement"] +body: + - type: markdown + attributes: + value: | + Thanks for requesting a new CodeQL data extension! This template helps Copilot Coding Agent understand your requirements. + + - type: dropdown + id: target-language + attributes: + label: Target Language + description: Which programming language should this data extension target? + options: + - cpp + - csharp + - go + - java + - javascript + - python + - ruby + default: 0 + validations: + required: true + + - type: input + id: library-url + attributes: + label: Library Repository / Documentation URL + description: "Link to the library's source code or API documentation. A GitHub repository URL is ideal — it allows the agent to browse the source code directly to identify sources, sinks, and summaries." + placeholder: "e.g., https://github.com/databricks/databricks-sql-python" + validations: + required: true + + - type: input + id: extension-name + attributes: + label: Data Extension Name (Optional) + description: "Extension name (e.g., databricks-sql.model.yml). Use `<library>-<module>.model.yml` naming. If the library has multiple modules/sub-packages (e.g., library-core, library-web, library-api), create separate model files per module." + placeholder: "e.g., databricks-sql.model.yml, django-http.model.yml" + validations: + required: false + + - type: textarea + id: library-modules + attributes: + label: Library Modules / Components + description: "If the library has distinct modules or sub-packages, list them here. 
Each module may become a separate model file (e.g., library-core.model.yml, library-web.model.yml). Include the import paths or package names." + placeholder: | + - databricks.sql (SQL connector: connect, cursor, execute) + - databricks.sdk (SDK client: WorkspaceClient, jobs, clusters) + - databricks.connect (Spark session bridge) + validations: + required: false + + - type: textarea + id: description + attributes: + label: Data Extension Description + description: "Describe the library/framework to model. What methods are sources of untrusted data? What methods are security-sensitive sinks? What methods sanitize data (barriers) or validate data (barrier guards)? All applicable model types (sourceModel, sinkModel, summaryModel, barrierModel, barrierGuardModel, typeModel, neutralModel) will be generated automatically." + placeholder: | + Library: databricks-sql-connector + - Sources: None (uses Flask request sources) + - Sinks: cursor.execute(query) is a SQL injection sink + - Summaries: connect() returns a connection, connection.cursor() returns a cursor + - Barriers: db_escape(value) sanitizes output for SQL injection + - Barrier Guards: is_safe_query(query) returns true when query is safe for SQL injection + + Docs: https://docs.databricks.com/... 
+ validations: + required: true + + - type: textarea + id: examples + attributes: + label: Code Examples + description: Provide sample end-to-end code that should be detected + placeholder: | + ```java + package org.example; + + // Undertow is not supported out of the box + import io.undertow.Undertow; + import io.undertow.server.HttpHandler; + import io.undertow.server.HttpServerExchange; + import io.undertow.util.Headers; + import java.util.Deque; + import javax.crypto.Cipher; + + public class App { + public String getGreeting() { + return "Hello World!"; + } + + public static void main(String[] args) { + System.out.println(new App().getGreeting()); + try { + Runtime.getRuntime().exec("ls"); + Cipher rsanopad = Cipher.getInstance("RSA/ECB/NoPadding"); + } catch (Exception e) { + System.out.println(e.getMessage()); + } + + Undertow server = Undertow.builder() + .addHttpListener(8080, "localhost") + .setHandler(new HttpHandler() { + @Override + public void handleRequest(final HttpServerExchange exchange) throws Exception { + String name = "world"; + Deque<String> res = exchange.getQueryParameters().get("namex"); // SOURCE + if (res != null) { + name = res.getFirst(); + } + exchange.getResponseHeaders().put(Headers.CONTENT_TYPE, "text/html"); + exchange.getResponseSender().send("Hello " + name + ""); // SINK XSS + } + }).build(); + server.start(); + } + } + ``` + validations: + required: false + + - type: input + id: references + attributes: + label: Additional References (Optional) + description: "Any other links — API docs, CWE references, related CodeQL queries, or security advisories." 
+ placeholder: "e.g., https://docs.databricks.com/sql/connector.html" + validations: + required: false + + - type: checkboxes + id: terms + attributes: + label: Code of Conduct + options: + - label: I agree to follow this project's Code of Conduct + required: true diff --git a/.github/PULL_REQUEST_TEMPLATE/data-extension-create.md b/.github/PULL_REQUEST_TEMPLATE/data-extension-create.md new file mode 100644 index 0000000..495d434 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE/data-extension-create.md @@ -0,0 +1,83 @@ +--- +name: 📦 New CodeQL Data Extension +about: Pull request for creating a new CodeQL data extension model +title: '[NEW DATA EXTENSION] ' +labels: + - data-extension-create + - enhancement +--- + +## 📝 Data Extension Information + +- **Language**: +- **Extension Name(s)**: +- **Extension Types**: +- **Target Library/Framework**: +- **Library Modules Covered**: + +## 🎯 Description + +### What This Data Extension Models + + + +### Threat Model + + + +### Example Vulnerable Code + +```[language] +// Code that should be detected with this data extension +``` + +### Example Safe Code + +```[language] +// Code that should NOT be detected +``` + +## 📦 Extension Details + +### Extension YAML + + + +```yaml +extensions: + - addsTo: + pack: codeql/[language]-all + extensible: sinkModel + data: + # - ["package","Member[...].Argument[0]","sink-kind"] +``` + +### Access Path Explanation + + + +## 🧪 Testing + +- [ ] Extension YAML resolves without errors +- [ ] Database created with sample code (`codeql database create` or `codeql test extract`) +- [ ] Single query verified with extension applied (`codeql query run --additional-packs=<path>`) +- [ ] Unit tests pass with extension applied (`codeql test run --additional-packs=<path>`) +- [ ] Positive test cases (vulnerable code detected) +- [ ] Negative test cases (safe code not flagged) + +## 📋 Checklist + +- [ ] Extension YAML is valid and properly formatted +- [ ] Extension placed in correct location 
(`languages/[language]/custom/src/`) +- [ ] `qlpack.yml` includes `dataExtensions` configuration +- [ ] Access paths verified via API graph queries +- [ ] No false positives in test cases +- [ ] Documentation/comments included in YAML + +## 🔗 References + + + +--- + +**Note**: This data extension was developed following CodeQL Models as Data best practices. diff --git a/.github/prompts/cpp_data_extension_development.prompt.md b/.github/prompts/cpp_data_extension_development.prompt.md new file mode 100644 index 0000000..fdfbc90 --- /dev/null +++ b/.github/prompts/cpp_data_extension_development.prompt.md @@ -0,0 +1,207 @@ +--- +mode: agent +--- + +# C / C++ Data Extension + +For general CodeQL data extension model development guidance, see [Common Data Extension Development](./data_extensions_development.prompt.md). +If you need to write a custom CodeQL query instead of a data extension, see [Common Query Development](./query_development.prompt.md). + +## C/C++-Specific Documentation + +### Documentation + +- [Customizing Library Models for C and C++](https://codeql.github.com/docs/codeql-language-guides/customizing-library-models-for-cpp/) + - Can also be found at [Customizing Library Models for C and C++ Docs](https://github.com/github/codeql/blob/main/docs/codeql/codeql-language-guides/customizing-library-models-for-cpp.rst) + +### Model Format + +C/C++ uses a **MaD (Models as Data)** format with **9-10 column tuples**. This is the same structural pattern as Java/Kotlin, C#, and Go, but with namespace-based identification and pointer indirection support. + +The pack name is `codeql/cpp-all`. 
+ +#### Extensible predicates + +| Predicate | Columns | Purpose | +| ------------------- | -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------ | +| `sourceModel` | `(namespace, type, subtypes, name, signature, ext, output, kind, provenance)` | Model sources of tainted data | +| `sinkModel` | `(namespace, type, subtypes, name, signature, ext, input, kind, provenance)` | Model sinks | +| `summaryModel` | `(namespace, type, subtypes, name, signature, ext, input, output, kind, provenance)` | Model flow through functions | +| `barrierModel` | `(namespace, type, subtypes, name, signature, ext, output, kind, provenance)` | Model barriers (sanitizers) that stop taint flow | +| `barrierGuardModel` | `(namespace, type, subtypes, name, signature, ext, input, acceptingValue, kind, provenance)` | Model barrier guards (validators) that stop taint via conditional checks | + +**Note:** C/C++ does **not** currently support `neutralModel`. + +#### Tuple column reference + +| Column | Description | Example | +| ---------------- | ------------------------------------------------------------------------------------ | --------------------------------- | +| `namespace` | C++ namespace (use `""` for global namespace) | `"boost::asio"`, `""` | +| `type` | Class name (use `""` for free functions) | `""`, `"Socket"` | +| `subtypes` | Whether model applies to overrides (`True`/`False`). Use `False` for free functions. | `False` | +| `name` | Function or method name | `"read_until"`, `"write"` | +| `signature` | Can narrow between overloaded functions. Use `""` to match all overloads. 
| `""` | +| `ext` | Leave empty (`""`) | `""` | +| `input`/`output` | Access path (supports pointer indirection via `*`) | `"Argument[*1]"`, `"ReturnValue"` | +| `kind` | Source/sink/summary kind | `"remote"`, `"remote-sink"` | +| `provenance` | Origin of the model | `"manual"` | + +#### Important: C/C++-specific rules + +- **Pointer indirection**: Use the `*` prefix on argument indices to dereference pointers. `Argument[*1]` means "the pointed-to value of the second argument." +- **Free functions** have `type` = `""` and `subtypes` = `False` +- **Namespace nesting**: Use `::` separator (e.g., `"boost::asio"`) +- **Global namespace** functions use `""` for the namespace column +- **Signature column** can be used to disambiguate overloaded functions, but `""` matches all overloads + +### Access Paths + +| Component | Description | +| ---------------- | -------------------------------------------------- | +| `Argument[n]` | Argument at index n (0-based, the value itself) | +| `Argument[*n]` | First indirection (pointed-to value) of argument n | +| `ReturnValue` | Return value of the function | +| `ReturnValue[*]` | Pointed-to value of the return value | + +### Sink Kinds + +`sql-injection`, `command-injection`, `path-injection`, `remote-sink` (data transmitted across network), `format-string` (uncontrolled format strings) + +### Sample Model + +Given a snippet using `boost::asio`: + +```cpp +boost::asio::write(socket, send_buffer, error); // sink: data sent over network +``` + +`boost_asio.model.yml` + +```yaml +extensions: + - addsTo: + pack: codeql/cpp-all + extensible: sourceModel + data: [] + + - addsTo: + pack: codeql/cpp-all + extensible: sinkModel + data: + - ['boost::asio', '', False, 'write', '', '', 'Argument[*1]', 'remote-sink', 'manual'] + + - addsTo: + pack: codeql/cpp-all + extensible: summaryModel + data: [] + + - addsTo: + pack: codeql/cpp-all + extensible: barrierModel + data: [] + + - addsTo: + pack: codeql/cpp-all + extensible: barrierGuardModel + 
data: [] +``` + +### Example: Source from Network Read + +```yaml +extensions: + - addsTo: + pack: codeql/cpp-all + extensible: sourceModel + data: + - ['boost::asio', '', False, 'read_until', '', '', 'Argument[*1]', 'remote', 'manual'] +``` + +Note: `Argument[*1]` means the **pointed-to value** of the second argument (the buffer being filled with network data). + +### Example: Flow Through `boost::asio::buffer` + +```yaml +extensions: + - addsTo: + pack: codeql/cpp-all + extensible: summaryModel + data: + - [ + 'boost::asio', + '', + False, + 'buffer', + '', + '', + 'Argument[*0]', + 'ReturnValue', + 'taint', + 'manual' + ] +``` + +### Example: Taint Barrier Using `mysql_real_escape_string` + +The `mysql_real_escape_string` function escapes special characters in a string for use in SQL statements, preventing SQL injection. The escaped output (written to the second argument's pointed-to value) is safe. + +```cpp +char *escaped_name = new char[2 * strlen(name) + 1]; +mysql_real_escape_string(mysql, escaped_name, name, strlen(name)); // escaped_name is safe for SQL +``` + +```yaml +extensions: + - addsTo: + pack: codeql/cpp-all + extensible: barrierModel + data: + - [ + '', + '', + False, + 'mysql_real_escape_string', + '', + '', + 'Argument[*1]', + 'sql-injection', + 'manual' + ] +``` + +Note: `Argument[*1]` means the **pointed-to value** of the second argument — the output buffer that receives the escaped string. The `kind` `"sql-injection"` must match the sink kind used by SQL injection queries. + +### Example: Barrier Guard Using a Validation Function + +A barrier guard models a function that returns a boolean indicating whether data is safe. When the function returns the expected value, taint flow is stopped through the guarded branch. 
+ +```cpp +if (is_safe(user_input)) { // The check guards the use + mysql_query(user_input); // This is safe +} +``` + +```yaml +extensions: + - addsTo: + pack: codeql/cpp-all + extensible: barrierGuardModel + data: + - ['', '', False, 'is_safe', '', '', 'Argument[*0]', 'true', 'sql-injection', 'manual'] +``` + +Note: The `acceptingValue` `"true"` means the barrier applies when `is_safe` returns true. The `input` `"Argument[*0]"` identifies the value being validated (the pointed-to value of the first argument). + +### Key Differences from Other Languages + +| Aspect | C/C++ | Java/C#/Go | +| ------------------- | ----------------------------------------------------- | --------------------------------------- | +| Pack name | `codeql/cpp-all` | `codeql/java-all`, etc. | +| Identifier column 1 | `namespace` (C++ namespace) | `package`/`namespace` | +| Pointer indirection | `Argument[*n]` for dereferenced pointers | Not applicable | +| `neutralModel` | Not supported | Supported | +| Receiver access | Not applicable (C++ uses `Argument[this]` if modeled) | `Argument[this]` / `Argument[receiver]` | + +### Additional References + +- **[C/C++ Reference](./cpp_query_development.prompt.md)** - C/C++ query development diff --git a/.github/prompts/csharp_data_extension_development.prompt.md b/.github/prompts/csharp_data_extension_development.prompt.md new file mode 100644 index 0000000..9ada2c9 --- /dev/null +++ b/.github/prompts/csharp_data_extension_development.prompt.md @@ -0,0 +1,318 @@ +--- +mode: agent +--- + +# C# Data Extension + +For general CodeQL data extension model development guidance, see [Common Data Extension Development](./data_extensions_development.prompt.md). +If you need to write a custom CodeQL query instead of a data extension, see [Common Query Development](./query_development.prompt.md). 
+ +## C#-Specific Documentation + +### Documentation + +- [Customizing Library Models for C#](https://codeql.github.com/docs/codeql-language-guides/customizing-library-models-for-csharp/) + - Can also be found at [Customizing Library Models for C# Docs](https://github.com/github/codeql/blob/main/docs/codeql/codeql-language-guides/customizing-library-models-for-csharp.rst) + +### Model Format + +C# uses a **MaD (Models as Data)** format with **9-10 column tuples** that identify callables by fully qualified namespace, type, method name, and signature. This is the same structural pattern as Java/Kotlin and Go. + +The pack name is `codeql/csharp-all`. + +#### Extensible predicates + +| Predicate | Columns | Purpose | +| ------------------- | -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------ | +| `sourceModel` | `(namespace, type, subtypes, name, signature, ext, output, kind, provenance)` | Model sources of tainted data | +| `sinkModel` | `(namespace, type, subtypes, name, signature, ext, input, kind, provenance)` | Model sinks | +| `summaryModel` | `(namespace, type, subtypes, name, signature, ext, input, output, kind, provenance)` | Model flow through methods | +| `barrierModel` | `(namespace, type, subtypes, name, signature, ext, output, kind, provenance)` | Model barriers (sanitizers) that stop taint flow | +| `barrierGuardModel` | `(namespace, type, subtypes, name, signature, ext, input, acceptingValue, kind, provenance)` | Model barrier guards (validators) that stop taint via conditional checks | +| `neutralModel` | `(namespace, type, name, signature, kind, provenance)` | Mark methods as having no dataflow impact | + +#### Tuple column reference + +| Column | Description | Example | +| ---------------- | ----------------------------------------------------------------------------------------------- | 
------------------------------------------------------- | +| `namespace` | Fully qualified namespace | `"System.Data.SqlClient"` | +| `type` | Class or interface name | `"SqlCommand"` | +| `subtypes` | Whether model applies to overrides (`True`/`False`) | `False` | +| `name` | Method/property name. Constructors use the class name. Getters: `get_Name`, Setters: `set_Name` | `"SqlCommand"`, `"get_Now"` | +| `signature` | Fully qualified parameter types in parentheses | `"(System.String,System.Data.SqlClient.SqlConnection)"` | +| `ext` | Leave empty (`""`) | `""` | +| `input`/`output` | Access path | `"Argument[0]"`, `"ReturnValue"` | +| `kind` | Source/sink/summary kind | `"sql-injection"`, `"taint"` | +| `provenance` | Origin of the model | `"manual"` | + +#### Important: C#-specific signature rules + +- Type names must be **fully qualified**: `System.String`, not `string` +- Generic type parameters must match source code names: `Select<TSource,TResult>` +- Generics in signatures must match: `(System.Collections.Generic.IEnumerable<TSource>,System.Func<TSource,TResult>)` +- Property getters/setters are modeled as `get_PropertyName`/`set_PropertyName` +- Constructors use the class name (e.g., `"SqlCommand"`) + +### Access Paths + +| Component | Description | +| ----------------- | -------------------------------------------- | +| `Argument[n]` | Argument at index n (0-based) | +| `Argument[this]` | The qualifier/receiver of a method call | +| `Argument[n1,n2]` | Shorthand for multiple arguments | +| `ReturnValue` | Return value of the method | +| `Element` | Elements of a collection (e.g., IEnumerable) | +| `Parameter[n]` | Parameter at index n of a delegate/lambda | +| `Field[name]` | Named field | +| `Property[name]` | Named property | + +### Sink Kinds + +`sql-injection`, `command-injection`, `code-injection`, `path-injection`, `url-redirection`, `log-injection`, `request-forgery`, `xpath-injection`, `ldap-injection` + +### Threat Models (C#-specific) + +In addition to `remote` and `local`, C# supports: + 
+- `file-write` — opening a file in write mode +- `windows-registry` — Windows registry values (C# only) + +### Sample Model + +Given a snippet where the `SqlCommand` constructor takes a SQL string: + +```csharp +public static void TaintSink(SqlConnection conn, string query) { + SqlCommand command = new SqlCommand(query, conn); // sink: SQL injection +} +``` + +`sqlclient.model.yml` + +```yaml +extensions: + - addsTo: + pack: codeql/csharp-all + extensible: sourceModel + data: [] + + - addsTo: + pack: codeql/csharp-all + extensible: sinkModel + data: + - [ + 'System.Data.SqlClient', + 'SqlCommand', + False, + 'SqlCommand', + '(System.String,System.Data.SqlClient.SqlConnection)', + '', + 'Argument[0]', + 'sql-injection', + 'manual' + ] + + - addsTo: + pack: codeql/csharp-all + extensible: summaryModel + data: [] + + - addsTo: + pack: codeql/csharp-all + extensible: barrierModel + data: [] + + - addsTo: + pack: codeql/csharp-all + extensible: barrierGuardModel + data: [] + + - addsTo: + pack: codeql/csharp-all + extensible: neutralModel + data: [] +``` + +### Example: Remote Source from Network Stream + +```yaml +extensions: + - addsTo: + pack: codeql/csharp-all + extensible: sourceModel + data: + - [ + 'System.Net.Sockets', + 'TcpClient', + False, + 'GetStream', + '()', + '', + 'ReturnValue', + 'remote', + 'manual' + ] +``` + +### Example: Flow Through `String.Concat` + +```yaml +extensions: + - addsTo: + pack: codeql/csharp-all + extensible: summaryModel + data: + - [ + 'System', + 'String', + False, + 'Concat', + '(System.Object,System.Object)', + '', + 'Argument[0,1]', + 'ReturnValue', + 'taint', + 'manual' + ] +``` + +Note: `Argument[0,1]` is shorthand for both `Argument[0]` and `Argument[1]`. 
+ +### Example: Flow Through `String.Trim` (Instance Method) + +```yaml +extensions: + - addsTo: + pack: codeql/csharp-all + extensible: summaryModel + data: + - [ + 'System', + 'String', + False, + 'Trim', + '()', + '', + 'Argument[this]', + 'ReturnValue', + 'taint', + 'manual' + ] +``` + +### Example: Flow Through LINQ `Select` (Higher-Order + Generics) + +```yaml +extensions: + - addsTo: + pack: codeql/csharp-all + extensible: summaryModel + data: + - [ + 'System.Linq', + 'Enumerable', + False, + 'Select<TSource,TResult>', + '(System.Collections.Generic.IEnumerable<TSource>,System.Func<TSource,TResult>)', + '', + 'Argument[0].Element', + 'Argument[1].Parameter[0]', + 'value', + 'manual' + ] + - [ + 'System.Linq', + 'Enumerable', + False, + 'Select<TSource,TResult>', + '(System.Collections.Generic.IEnumerable<TSource>,System.Func<TSource,TResult>)', + '', + 'Argument[1].ReturnValue', + 'ReturnValue.Element', + 'value', + 'manual' + ] +``` + +Note: Two rows model the two-step flow: collection elements into the lambda parameter, then from the lambda return value into the output collection elements. Generic type parameter names must match the source code. + +### Example: Neutral Model (Property Getter) + +```yaml +extensions: + - addsTo: + pack: codeql/csharp-all + extensible: neutralModel + data: + - ['System', 'DateTime', 'get_Now', '()', 'summary', 'manual'] +``` + +### Example: Barrier for URL Redirection + +The `RawUrl` property of `HttpRequest` returns the raw URL of the current request, which is safe for URL redirects because it cannot be manipulated by an attacker. 
+ +```csharp +public static void TaintBarrier(HttpRequest request) { + string url = request.RawUrl; // Safe for URL redirects + Response.Redirect(url); // Not a URL redirection vulnerability +} +``` + +```yaml +extensions: + - addsTo: + pack: codeql/csharp-all + extensible: barrierModel + data: + - [ + 'System.Web', + 'HttpRequest', + False, + 'get_RawUrl', + '()', + '', + 'ReturnValue', + 'url-redirection', + 'manual' + ] +``` + +Note: Property getters are modeled as `get_PropertyName`. The `kind` `"url-redirection"` must match the sink kind used by URL redirection queries. + +### Example: Barrier Guard for URL Validation + +The `IsAbsoluteUri` property of `Uri` returns `false` when the URL is relative and therefore safe for URL redirects. + +```csharp +public static void TaintBarrierGuard(Uri uri) { + if (!uri.IsAbsoluteUri) { // The check guards the redirect + Response.Redirect(uri.ToString()); // Safe + } +} +``` + +```yaml +extensions: + - addsTo: + pack: codeql/csharp-all + extensible: barrierGuardModel + data: + - [ + 'System', + 'Uri', + False, + 'get_IsAbsoluteUri', + '()', + '', + 'Argument[this]', + 'false', + 'url-redirection', + 'manual' + ] +``` + +Note: The `acceptingValue` `"false"` means the barrier applies when `IsAbsoluteUri` is false (the URL is relative). The `input` `"Argument[this]"` identifies the qualifier (`uri`) whose taint flow is blocked. 
+ +### Additional References + +- **[C# Reference](./csharp_query_development.prompt.md)** - C# query development diff --git a/.github/prompts/data_extensions_development.prompt.md b/.github/prompts/data_extensions_development.prompt.md new file mode 100644 index 0000000..f8e9c35 --- /dev/null +++ b/.github/prompts/data_extensions_development.prompt.md @@ -0,0 +1,489 @@ +--- +mode: agent +--- + +# CodeQL Data Extensions / Models as Data / Model Packs + +This prompt provides common guidance for developing CodeQL data extensions across all supported languages, while language-specific prompts reference this common guidance and add language-specific details. + +## Product Documentation + +- [Extending coverage for a repository](https://docs.github.com/en/code-security/how-tos/scan-code-for-vulnerabilities/manage-your-configuration/editing-your-configuration-of-default-setup#extending-coverage-for-a-repository) - `.github/codeql/extensions` directory for local model pack references (does not need a qlpack.yml) +- [Extending coverage for all repositories in an organization](https://docs.github.com/en/code-security/how-tos/scan-code-for-vulnerabilities/manage-your-configuration/editing-your-configuration-of-default-setup#extending-coverage-for-all-repositories-in-an-organization) - publishing model packs and referencing them globally (must be done by clicking a button in the UI) +- [Creating a CodeQL model pack](https://docs.github.com/en/code-security/tutorials/customize-code-scanning/creating-and-working-with-codeql-packs?versionId=free-pro-team%40latest&productId=code-security&restPage=how-tos%2Cscan-code-for-vulnerabilities%2Cmanage-your-configuration%2Cediting-your-configuration-of-default-setup#creating-a-codeql-model-pack) - publishing a model pack + for dataExtensions via qlpack.yml + +## Core Principles + +CodeQL analysis can be customized by adding library models in data extension YAML files to recognize libraries and frameworks that are not supported by default. 
+Model packs can be used to expand code scanning analysis at scale. Model packs use data extensions, which are implemented as YAML and describe how to add data for new dependencies. When a model pack is specified, the data extensions in that pack will be added to the code scanning analysis automatically. + +Generally, each language allows customization of the following extensible predicates: + +- sourceModel - This is used to model sources of potentially tainted data. The `kind` of the sources defined using this predicate determines which **threat model** they are associated with (e.g., `remote`, `local`, `file`, `commandargs`). Different threat models can be used to customize the sources used in an analysis. +- sinkModel - This is used to model sinks where tainted data may be used in a way that makes the code vulnerable. The `kind` identifies the vulnerability class (e.g., `sql-injection`, `command-injection`). +- summaryModel - This is used to model flow through elements. The `kind` is either `taint` (derived value) or `value` (same value). +- neutralModel - This is similar to a summary model but used to model the flow of values that have only a minor impact on the dataflow analysis. Used to override incorrect auto-generated models. +- barrierModel - This is used to model barriers (sanitizers), which are elements that stop the flow of taint for a specified query kind. For example, an HTML-escaping function that prevents cross-site scripting. The `kind` must match the corresponding sink kind (e.g., `sql-injection`, `html-injection`). Available since CodeQL 2.25.2. +- barrierGuardModel - This is used to model barrier guards (validators), which are elements that return a boolean indicating whether data is safe. When the conditional check returns the specified `acceptingValue` (e.g., `"true"` or `"false"`), taint flow is stopped through guarded branches. The `kind` must match the corresponding sink kind. Available since CodeQL 2.25.2. 
+- typeModel - Only available in **API Graph languages** (Python, Ruby, JavaScript/TypeScript). Defines type relationships so that models for a parent type automatically apply to subtypes. MaD languages (Java/Kotlin, C#, Go, C/C++) handle subtyping via the `subtypes` boolean column in their tuples instead. + +### What to Model in a Library + +When reviewing a library or framework's documentation/API surface, identify the following categories of methods. All are important — sources, sinks, and summaries work together to form a complete taint-tracking path. Missing any one of them can break the chain and cause false negatives. + +#### How to read a library's API for modeling + +Given a library's documentation, ask these questions for each public method, function, or class: + +1. **Does this method return data from an external source?** (network, filesystem, user input, environment) → **Source** +2. **Does this method consume data in a security-sensitive operation?** (execute SQL, run a shell command, write to a file path, redirect a URL) → **Sink** +3. **Does this method pass data through without CodeQL being able to see the implementation?** (transform, encode, decode, copy, wrap, unwrap, iterate) → **Summary** +4. **Does this method sanitize data so its output is safe for a specific vulnerability type?** (HTML-escape, SQL-escape, path canonicalization, URL encoding) → **Barrier** +5. **Does this method return a boolean indicating whether data is safe to use?** (URL validation, input format checking, allowlist matching) → **Barrier Guard** +6. **Is this type a subclass or variant of another type we've already modeled?** → **Type model** +7. **Has CodeQL's model generator incorrectly flagged this method as having flow?** → **Neutral** + +#### Sources (sourceModel) + +Sources are methods that return data from outside the application boundary. Without source models, taint tracking has no starting point. 
+ +Look for methods that: + +- Read from HTTP requests (parameters, headers, body, cookies, URL) +- Read from WebSocket/gRPC/messaging channels +- Read from files, stdin, environment variables, command-line arguments +- Read from databases or caches +- Deserialize external data (JSON, XML, YAML, Protobuf) + +The `kind` column determines the threat model category — see the Threat Models section below. + +#### Sinks (sinkModel) + +Sinks are methods that consume data in a way that can cause a vulnerability if the data is attacker-controlled. Without sink models, CodeQL cannot flag the vulnerability even if tainted data reaches the dangerous call. + +Look for methods that: + +- Execute SQL or NoSQL queries +- Execute OS commands or shell scripts +- Evaluate code dynamically (eval, template rendering) +- Access filesystem paths +- Redirect users to URLs +- Construct LDAP/XPath/regex queries from input +- Send data over the network (cleartext transmission) +- Deserialize untrusted data into objects + +Each sink kind maps to a specific vulnerability class: + +| Sink Kind | Vulnerability | Example | +| ------------------------ | ---------------------------------- | ------------------------------------ | +| `sql-injection` | SQL Injection (CWE-089) | `cursor.execute(query)` | +| `command-injection` | OS Command Injection (CWE-078) | `subprocess.run(cmd)` | +| `code-injection` | Code Injection (CWE-094) | `eval(expr)` | +| `path-injection` | Path Traversal (CWE-022) | `open(filepath)` | +| `url-redirection` | Open Redirect (CWE-601) | `redirect(url)` | +| `log-injection` | Log Injection (CWE-117) | `logger.info(msg)` | +| `request-forgery` | SSRF (CWE-918) | `fetch(url)` | +| `nosql-injection` | NoSQL Injection | `collection.find(query)` | +| `xpath-injection` | XPath Injection | XPath query construction | +| `ldap-injection` | LDAP Injection | LDAP search filter construction | +| `html-injection` | XSS (CWE-079) | DOM manipulation (JS only) | +| `unsafe-deserialization` | 
Insecure Deserialization (CWE-502) | Unsafe YAML/pickle parsing (JS only) | +| `remote-sink` | Cleartext Transmission (CWE-319) | Network write (C/C++ only) | + +Not all sink kinds are available in all languages — see language-specific prompts for details. + +#### Summaries (summaryModel) + +Summaries describe how taint propagates **through** a method call. Without summaries, taint tracking loses track of data as it passes through library/framework code, causing false negatives. + +Look for methods that: + +- Transform data (encode, decode, escape, unescape, serialize, deserialize) +- Copy or wrap data (constructors, builders, factory methods) +- Pass data through collections (add to list, get from map, iterate) +- Concatenate, split, or format strings +- Chain or compose operations (middleware, decorators, pipes) + +Two summary kinds: + +- `taint` — the output is derived from the input but not necessarily identical (e.g., string concatenation, encoding, parsing). Use this for most cases. +- `value` — the output is the same value or a direct copy (e.g., getter, identity transform, collection element access). Preserves all properties of the original value. + +**When to model summaries:** Focus on methods that sit on the path between a source and a sink. If taint already flows end-to-end without a summary, you don't need one. + +#### Types (typeModel) + +Type models define relationships between types (e.g., "this subclass should inherit all models from its parent"). Useful to avoid duplicating sink/source/summary models across related classes. + +**Supported by:** Python, Ruby, JavaScript/TypeScript (API Graph languages only) + +**Not available in:** Java/Kotlin, C#, Go, C/C++ — these MaD languages handle subtyping through the `subtypes` boolean column in their source/sink/summary tuples. Setting `subtypes: True` makes the model apply to all overrides and implementations of the specified method. 
+ +#### Barriers (barrierModel) + +Barriers model sanitizer functions — methods whose output is considered safe for a specific vulnerability type. A barrier stops taint flow at the modeled element. The `kind` value must match the sink kind used by the query where the barrier should take effect (e.g., `sql-injection`, `html-injection`, `path-injection`, `url-redirection`, `request-forgery`). + +Look for methods that: + +- Escape or encode output for a specific context (HTML-escape, SQL-escape, shell-escape) +- Canonicalize or normalize paths to prevent traversal +- Encode data to prevent injection (URL encoding, base64 for safe contexts) +- Strip or sanitize dangerous characters from input + +#### Barrier Guards (barrierGuardModel) + +Barrier guards model validator functions — methods that return a boolean indicating whether data is safe to use. When the function returns the expected `acceptingValue` (typically `"true"` or `"false"`), taint flow is stopped through the guarded branch. The `kind` must match the corresponding sink kind. + +Look for methods that: + +- Validate URLs (e.g., check if relative, check against allowlist) +- Check input format or pattern (e.g., is numeric, matches regex) +- Verify data integrity or safety (e.g., signature validation, allowlist check) +- Return a boolean that gates subsequent security-sensitive operations + +#### Neutrals (neutralModel) + +Neutral models explicitly mark a method as having no taint flow. Their primary purpose is to **override auto-generated models** — if CodeQL's model generator (`df-generated` provenance) incorrectly assigned a summary to a method, a manual neutral model suppresses it. They also have a minor effect on dataflow dispatch. Generally only needed when curating generated models. 
+ +### Model File Organization + +When a library or framework has distinct modules, packages, or sub-libraries, split models into **separate YAML files per module** rather than putting everything in one file. This makes models easier to review, maintain, and test independently. + +For example, if modeling the `databricks` ecosystem: + +``` +models/ + databricks-sql.model.yml # databricks.sql module (connect, cursor, execute) + databricks-sdk.model.yml # databricks SDK client methods + databricks-connect.model.yml # databricks-connect Spark session +``` + +Or for a web framework like Django: + +``` +models/ + django-http.model.yml # django.http request/response sources + django-db.model.yml # django.db ORM sinks (raw SQL) + django-shortcuts.model.yml # django.shortcuts (redirect sinks) + django-template.model.yml # django.template (template injection sinks) +``` + +Naming convention: `<library>-<module>.model.yml` (lowercase, hyphen-separated). + +All `.model.yml` files within a model pack are automatically picked up via the `dataExtensions` glob in `qlpack.yml` (e.g., `dataExtensions: models/**/*.yml`). + +### Common Workflows + +Data extensions support three primary workflows. An agent should follow the appropriate procedure end-to-end rather than jumping straight to YAML authoring. + +#### Workflow 1: Creating a new `.model.yml` + +1. **Identify the library to model** — review the library's API documentation or source code and classify public methods as sources, sinks, summaries, barriers, or barrier guards (see "What to Model in a Library" above) +2. **Determine the correct format** — check whether the target language uses API Graph (Python, Ruby, JS/TS) or MaD (Java/Kotlin, C#, Go, C/C++) tuples (see "Two Model Formats" below) +3. **Create the YAML file** — use the naming convention `<library>-<module>.model.yml` and the appropriate column format for the language +4. 
**Place the file** — choose one of two paths depending on scope: + - **Single repository:** Place the `.model.yml` directly in `.github/codeql/extensions/<name>/` — no `qlpack.yml` is needed; Code Scanning picks up extensions from this directory automatically + - **Model pack (reusable across repos):** Place the file under a pack directory (e.g., `languages/<language>/custom/src/`) with a `qlpack.yml` that declares `extensionTargets` and `dataExtensions` +5. **Test locally** — run a targeted query against a sample database to confirm new findings appear (see "Model Pack / Data Extension Options" below for `--additional-packs` usage): + ```bash + codeql query run \ + --database=/path/to/db \ + --additional-packs=<path-to-model-pack> \ + --output=results.bqrs \ + -- path/to/RelevantQuery.ql + ``` +6. **Validate results** — decode and inspect results with `codeql bqrs decode`; confirm expected findings appear and no false positives are introduced + +#### Workflow 2: Updating an existing `.model.yml` + +1. **Find the existing model file** — check these locations in order: + - `.github/codeql/extensions/` in the current repository + - `languages/<language>/custom/src/` in this template repository + - Published model packs (search GHCR or your org's CodeQL pack registry) + - **Note:** Models in upstream `codeql/<language>-all` packs cannot be edited directly — create a custom model pack that adds new rows alongside the built-in models +2. **Add new rows** to the appropriate extensible predicate section (`sinkModel`, `sourceModel`, `summaryModel`, etc.) — do not remove existing rows unless they are incorrect +3. **Maintain consistency** — match the existing formatting, column count, and provenance values in the file +4. **Re-test** — run the same query or test suite that covers the library to confirm: + - Existing findings are unchanged (no regressions) + - New coverage produces expected results +5. 
**Bump the version** — if the model file lives in a published model pack, increment the `version` field in `qlpack.yml` before publishing + +#### Workflow 3: Publishing a model pack to GHCR + +1. **Ensure `qlpack.yml` is configured correctly:** + ```yaml + name: <scope>/<language>-<library> + version: 1.0.0 + library: true + extensionTargets: + codeql/<language>-all: '*' + dataExtensions: + - models/**/*.yml + ``` +2. **Run `codeql pack publish`** to push the pack to the GitHub Container Registry +3. **Configure for org-wide Default Setup** — in the GitHub organization settings, navigate to Code security → Default setup → Model packs and add `<scope>/<language>-<library>` (see [Extending coverage for all repositories in an organization](https://docs.github.com/en/code-security/how-tos/find-and-fix-code-vulnerabilities/manage-your-configuration/editing-your-configuration-of-default-setup#extending-codeql-coverage-with-codeql-model-packs-in-default-setup)) +4. **For updates to an already-published pack** — increment the `version` in `qlpack.yml`, then re-run `codeql pack publish`; Default Setup will pick up the new version automatically based on the version range configured + +### Two Model Formats: API Graph vs MaD + +CodeQL data extensions use one of two tuple formats depending on the language. Using the wrong format for a language will produce invalid extensions. + +#### API Graph format (short tuples) + +Used by: **Python**, **Ruby**, **JavaScript/TypeScript** + +Tuples identify targets by a **type** string and an **access path** that navigates the API graph. Tuples are compact (3-5 columns). 
+ +```yaml +# sinkModel(type, path, kind) — 3 columns +- [ + 'databricks', + 'Member[sql].Member[connect].ReturnValue.Member[cursor].ReturnValue.Member[execute].Argument[0]', + 'sql-injection' + ] + +# summaryModel(type, path, input, output, kind) — 5 columns +- ['global', 'Member[decodeURIComponent]', 'Argument[0]', 'ReturnValue', 'taint'] + +# barrierModel(type, path, kind) — 3 columns +- ['html', 'Member[escape].ReturnValue', 'html-injection'] + +# barrierGuardModel(type, path, acceptingValue, kind) — 4 columns +- ['my-package', 'Member[isValid].Argument[0]', 'true', 'sql-injection'] +``` + +- The `type` column is a starting point (package name, class name, or `"global"`) +- The `path` column is a `.`-separated chain of API graph tokens like `Member[x]`, `ReturnValue`, `Argument[n]`, `Parameter[n]` +- API graph paths can be verified by writing a CodeQL query that walks the API graph (see language-specific prompts) + +#### MaD (Models as Data) format (long tuples) + +Used by: **Java/Kotlin**, **C#**, **Go**, **C/C++** + +Tuples identify targets by **fully qualified package/namespace, type, method name, and signature**. Tuples are verbose (9-10 columns). 
+ +```yaml +# sinkModel(package, type, subtypes, name, signature, ext, input, kind, provenance) — 9 columns +- [ + 'java.sql', + 'Statement', + True, + 'execute', + '(String)', + '', + 'Argument[0]', + 'sql-injection', + 'manual' + ] + +# summaryModel(package, type, subtypes, name, signature, ext, input, output, kind, provenance) — 10 columns +- [ + 'System', + 'String', + False, + 'Concat', + '(System.Object,System.Object)', + '', + 'Argument[0,1]', + 'ReturnValue', + 'taint', + 'manual' + ] + +# barrierModel(package, type, subtypes, name, signature, ext, output, kind, provenance) — 9 columns +- ['java.io', 'File', True, 'getName', '()', '', 'ReturnValue', 'path-injection', 'manual'] + +# barrierGuardModel(package, type, subtypes, name, signature, ext, input, acceptingValue, kind, provenance) — 10 columns +- [ + 'java.net', + 'URI', + True, + 'isAbsolute', + '()', + '', + 'Argument[this]', + 'false', + 'request-forgery', + 'manual' + ] +``` + +- The first 5 columns locate the callable: `package`/`namespace`, `type`, `subtypes` (bool), `name`, `signature` +- `subtypes: True` means the model applies to overrides/implementors +- `signature` uses fully qualified type names (Go always uses `""`) +- The `provenance` column (last) should be `"manual"` for hand-written models + +#### Quick reference + +| | API Graph | MaD | +| ------------------------- | ------------------------------------ | ----------------------------------------------------- | +| **Languages** | Python, Ruby, JS/TS | Java/Kotlin, C#, Go, C/C++ | +| **Pack name** | `codeql/<language>-all` | `codeql/<language>-all` | +| **Sink columns** | 3 (type, path, kind) | 9 | +| **Summary columns** | 5 | 10 | +| **Barrier columns** | 3 (type, path, kind) | 9 | +| **Barrier guard columns** | 4 (type, path, acceptingValue, kind) | 10 | +| **Target identification** | Access path navigation | Package + type + method + signature | +| **Pointer indirection** | N/A | C/C++ only: `Argument[*n]` | +| **Receiver access** | 
`Argument[self]` (Ruby/Python) | `Argument[this]` (Java/C#), `Argument[receiver]` (Go) | + +For detailed syntax and examples, see the language-specific data extension prompts. + +### Threat Models + +Threat models control which `sourceModel` entries are active during analysis. The `kind` column of a `sourceModel` determines its threat model category. + +#### Default behavior + +By default, only the **`remote`** threat model is enabled. This means only sources marked with `kind: "remote"` are active. To include local sources, you must explicitly enable additional threat models via `--threat-model` on the CLI or in the code scanning configuration. + +#### Categories + +**`remote`** (enabled by default) + +- Network requests and responses — HTTP parameters, headers, request bodies, WebSocket messages, API responses +- This is the primary threat model for web-facing applications + +**`local`** (must be explicitly enabled) +Represents data from the local system. Subcategories can be enabled/disabled independently: + +| Subcategory | Description | Example | +| ------------------ | --------------------------------- | --------------------------- | +| `file` | Local file reads | `open("config.txt").read()` | +| `commandargs` | Command-line arguments | `sys.argv[1]` | +| `database` | Database query results | `cursor.fetchall()` | +| `environment` | Environment variables | `os.environ["KEY"]` | +| `stdin` | Standard input | `input()` | +| `windows-registry` | Windows registry values (C# only) | `Registry.GetValue()` | + +Enable selectively: `--threat-model commandargs --threat-model environment` enables only those two, not all of `local`. 
+ +**Language-specific categories:** + +| Category | Description | Language | +| ------------------------ | ---------------------------------------------- | -------------------------- | +| `android` | External storage reads, ContentProvider params | Java/Kotlin only | +| `reverse-dns` | Reverse DNS lookups | Java only | +| `database-access-result` | Database access results | JavaScript only | +| `file-write` | Opening files in write mode | C# only | +| `view-component-input` | React/Vue/Angular component props | JavaScript/TypeScript only | + +#### Choosing a threat model for your source + +- Use `"remote"` for any data that arrives over the network — this is the most common and is active by default +- Use specific `local` subcategories (e.g., `"file"`, `"commandargs"`) when modeling local input mechanisms — be precise rather than using the generic `"local"` parent +- When in doubt, use `"remote"` — it provides the broadest default coverage + +### Model Quality Criteria + +Your generated CodeQL models will be evaluated on: + +1. **Code Quality**: + - **Critical**: Extensions must be formatted without errors. Invalid extensions cause the engine to fail and count against code quality. + - **Important**: Minimize warning-level diagnostics (deprecated elements, style guide deviations) + - **Best Practice**: Follow CodeQL naming conventions and idioms, provide comments with sensible organization + +### Common Pitfalls + +1. **Invalid definitions**: YAML models that do not match the defined format, or that have not been tested for validity, cannot be trusted. + +### Development + +Access paths for data extensions are parsed using [shared/dataflow/codeql/dataflow/internal/AccessPathSyntax.qll](https://github.com/github/codeql/blob/main/shared/dataflow/codeql/dataflow/internal/AccessPathSyntax.qll) + +For languages that use API Graphs, access paths can be most easily tested by: + +1. 
creating a small codeql database with some sample code that has a full end to end flow for the suspected query +2. writing/executing a sample codeql query using api graphs to verify with 100% certainty that the path to discover the suspected source/sink/summary is verified. + +To understand if APIGraphs are used by the language, it is best to evaluate the ModelsAsData.qll for the given language. + +- ex: [python/ql/lib/semmle/python/frameworks/data/ModelsAsData.qll](https://github.com/github/codeql/blob/main/python/ql/lib/semmle/python/frameworks/data/ModelsAsData.qll) for python imports ApiGraphModels and ApiGraphs + - [python/ql/lib/semmle/python/frameworks/data/internal/ApiGraphModels.qll](https://github.com/github/codeql/blob/main/python/ql/lib/semmle/python/frameworks/data/internal/ApiGraphModels.qll) dealing with flow models specified in extensible predicates. + - [python/ql/lib/semmle/python/frameworks/data/internal/ApiGraphModelsSpecific.qll](https://github.com/github/codeql/blob/main/python/ql/lib/semmle/python/frameworks/data/internal/ApiGraphModelsSpecific.qll) handles the Python-specific Member[x] tokens by calling node.getMember(x) on the API graph + +## CLI References + +Essential commands for query development: + +### Core Development Commands + +- [qlt query generate new-query](../../resources/cli/qlt/qlt_query_generate_new-query.prompt.md) - Generate scaffolding for a new CodeQL query with packs and tests +- [codeql query compile](../../resources/cli/codeql/codeql_query_compile.prompt.md) - Compile and validate query syntax +- [codeql query run](../../resources/cli/codeql/codeql_query_run.prompt.md) - Execute queries against databases +- [codeql execute query-server2](../../resources/cli/codeql/codeql_execute_query-server2.prompt.md) - Run a persistent query execution server for efficient multi-query workflows and IDE integrations +- [codeql query format](../../resources/cli/codeql/codeql_query_format.prompt.md) - Format query source code +- 
[codeql test run](../../resources/cli/codeql/codeql_test_run.prompt.md) - Execute query test suites +- [codeql test extract](../../resources/cli/codeql/codeql_test_extract.prompt.md) - Create test databases + +### Database Operations + +- [codeql database create](../../resources/cli/codeql/codeql_database_create.prompt.md) - Create CodeQL databases +- [codeql database analyze](../../resources/cli/codeql/codeql_database_analyze.prompt.md) - Run queries against databases + +### Package Management + +- [codeql pack install](../../resources/cli/codeql/codeql_pack_install.prompt.md) - Install query dependencies +- [codeql resolve library-path](../../resources/cli/codeql/codeql_resolve_library-path.prompt.md) - Resolve library paths + +### Results Analysis + +- [codeql bqrs decode](../../resources/cli/codeql/codeql_bqrs_decode.prompt.md) - Convert binary results to text +- [codeql bqrs info](../../resources/cli/codeql/codeql_bqrs_info.prompt.md) - Inspect result metadata + +### Model Pack / Data Extension Options + +During development, you'll typically test data extensions with a **single query** or **unit test** โ€” not `codeql database analyze` (which is for full analysis runs / CI). + +#### Running a single query with model packs + +Use `codeql query run` with `--model-packs` or `--additional-packs`: + +```bash +# Use a published model pack by name against a single query +codeql query run \ + --database=/path/to/db \ + --model-packs=my-org/my-model-pack \ + --output=results.bqrs \ + -- path/to/MyQuery.ql + +# Use a local (unpublished) model pack during development +codeql query run \ + --database=/path/to/db \ + --additional-packs=languages//custom/src \ + --output=results.bqrs \ + -- path/to/MyQuery.ql +``` + +#### Running unit tests with model packs + +`codeql test run` does **not** support `--model-packs`. Instead, data extensions are resolved through `qlpack.yml` configuration: + +1. 
The **model pack** declares `extensionTargets` and `dataExtensions` in its `qlpack.yml` +2. The **test pack** declares a dependency on the model pack in its `qlpack.yml` +3. Use `--additional-packs` to point the test runner at a local (unpublished) model pack: + +```bash +codeql test run \ + --additional-packs=languages//custom/src \ + --keep-databases \ + --show-extractor-output \ + -- languages///test// +``` + +#### Full option reference + +| Option | Available on | Purpose | +| ------------------------------------- | ---------------------------------------------------------------- | ---------------------------------------------------------------------------- | +| `--model-packs=` | `codeql query run`, `codeql database analyze` | Reference published model packs by name | +| `--additional-packs=[;...]` | `codeql query run`, `codeql test run`, `codeql database analyze` | Search local directories for packs (primary mechanism for local development) | +| `--no-database-extension-packs` | `codeql database analyze` | Omit extensions bundled into the database at creation time | +| `--no-database-threat-models` | `codeql database analyze` | Omit threat model config stored in the database | +| `--threat-model=` | `codeql database analyze` | Enable/disable threat model categories (e.g., `local`, `remote`, `all`) | + +## Related Resources + +- [Test-Driven QL Development](./test_driven_ql_development.prompt.md) - Comprehensive TDD workflow +- [Language-specific prompts](.) - Additional guidance for specific languages diff --git a/.github/prompts/go_data_extension_development.prompt.md b/.github/prompts/go_data_extension_development.prompt.md new file mode 100644 index 0000000..d2d053f --- /dev/null +++ b/.github/prompts/go_data_extension_development.prompt.md @@ -0,0 +1,294 @@ +--- +mode: agent +--- + +# Go Data Extension + +For general CodeQL data extension model development guidance, see [Common Data Extension Development](./data_extensions_development.prompt.md). 
+If you need to write a custom CodeQL query instead of a data extension, see [Common Query Development](./query_development.prompt.md). + +## Go-Specific Documentation + +### Documentation + +- [Customizing Library Models for Go](https://codeql.github.com/docs/codeql-language-guides/customizing-library-models-for-go/) + - Can also be found at [Customizing Library Models for Go Docs](https://github.com/github/codeql/blob/main/docs/codeql/codeql-language-guides/customizing-library-models-for-go.rst) + +### Model Format + +Go uses a **MaD (Models as Data)** format with **9-10 column tuples** that identify callables by package path, type, function name, and signature. Same structural pattern as Java/Kotlin and C#. + +The pack name is `codeql/go-all`. + +#### Extensible predicates + +| Predicate | Columns | Purpose | +| ------------------- | ------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------ | +| `sourceModel` | `(package, type, subtypes, name, signature, ext, output, kind, provenance)` | Model sources of tainted data | +| `sinkModel` | `(package, type, subtypes, name, signature, ext, input, kind, provenance)` | Model sinks | +| `summaryModel` | `(package, type, subtypes, name, signature, ext, input, output, kind, provenance)` | Model flow through functions | +| `barrierModel` | `(package, type, subtypes, name, signature, ext, output, kind, provenance)` | Model barriers (sanitizers) that stop taint flow | +| `barrierGuardModel` | `(package, type, subtypes, name, signature, ext, input, acceptingValue, kind, provenance)` | Model barrier guards (validators) that stop taint via conditional checks | +| `neutralModel` | `(package, type, name, signature, kind, provenance)` | Mark functions as having no dataflow impact | + +#### Tuple column reference + +| Column | Description | Example | +| ---------------- | 
--------------------------------------------------------------------------------- | -------------------------------- | +| `package` | Go package import path | `"database/sql"`, `"net/http"` | +| `type` | Receiver type name (leave `""` for free functions) | `"DB"`, `""` | +| `subtypes` | Whether model applies to embedded types / interface implementors (`True`/`False`) | `True` | +| `name` | Function or method name, or field name | `"Prepare"`, `"Body"` | +| `signature` | **Always `""` for Go** (Go does not use signature-based overload resolution) | `""` | +| `ext` | Leave empty (`""`) | `""` | +| `input`/`output` | Access path | `"Argument[0]"`, `"ReturnValue"` | +| `kind` | Source/sink/summary kind | `"sql-injection"`, `"taint"` | +| `provenance` | Origin of the model | `"manual"` | + +#### Important: Go-specific rules + +- **Signature is always `""`** โ€” Go does not have overloaded functions, so the signature column is unused +- **Free functions** have `type` = `""` and `subtypes` = `False` +- **`subtypes: True`** includes embedded types (promoted methods/fields) and interface implementors +- **Field access** is modeled as a source with an empty output access path: `output` = `""` +- **Multiple return values**: use `ReturnValue[0]`, `ReturnValue[1]`, etc. 
+- **Receiver access path**: use `Argument[receiver]` (not `Argument[this]`) + +### Access Paths + +| Component | Description | +| -------------------- | ----------------------------------------------------- | +| `Argument[n]` | Argument at index n (0-based) | +| `Argument[receiver]` | The receiver of a method call (`u` in `u.Hostname()`) | +| `Argument[n1..n2]` | Range of arguments | +| `Argument[*n]` | First indirection (pointer dereference) of argument n | +| `ReturnValue` | Return value (or first return value) | +| `ReturnValue[n]` | The nth return value (0-indexed) | +| `ArrayElement` | Elements of a slice/array | +| `MapKey` | Key of a map | +| `MapValue` | Value of a map | + +### Package Versioning + +- Go modules with major version > 1 include the version suffix in the import path (e.g., `github.com/example/pkg/v2`) +- **Omit the version suffix** in the `package` column to match **all versions** automatically +- To match only a specific major version, include the suffix: `"github.com/example/pkg/v2"` +- To match only v1 (no suffix), use the `fixed-version:` prefix: `"fixed-version:github.com/example/pkg"` +- For `gopkg.in` packages, the `.v2` suffix is also handled automatically + +### Package Grouping + +When the same package is available under multiple import paths, use the `packageGrouping` extensible predicate: + +```yaml +extensions: + - addsTo: + pack: codeql/go-all + extensible: packageGrouping + data: + - ['glog', 'github.com/golang/glog'] + - ['glog', 'gopkg.in/glog'] + + - addsTo: + pack: codeql/go-all + extensible: sinkModel + data: + - ['group:glog', '', False, 'Info', '', '', 'Argument[0]', 'log-injection', 'manual'] +``` + +### Sink Kinds + +`sql-injection`, `nosql-injection`, `command-injection`, `path-injection`, `url-redirection`, `log-injection`, `request-forgery`, `xpath-injection` + +### Sample Model + +Given a snippet where `db.Prepare(query)` is a SQL injection sink: + +```go +func Tainted(db *sql.DB, name string) { + stmt, err := 
db.Prepare("SELECT * FROM users WHERE name = " + name) // sink +} +``` + +`database_sql.model.yml` + +```yaml +extensions: + - addsTo: + pack: codeql/go-all + extensible: sourceModel + data: [] + + - addsTo: + pack: codeql/go-all + extensible: sinkModel + data: + - ['database/sql', 'DB', True, 'Prepare', '', '', 'Argument[0]', 'sql-injection', 'manual'] + + - addsTo: + pack: codeql/go-all + extensible: summaryModel + data: [] + + - addsTo: + pack: codeql/go-all + extensible: barrierModel + data: [] + + - addsTo: + pack: codeql/go-all + extensible: barrierGuardModel + data: [] + + - addsTo: + pack: codeql/go-all + extensible: neutralModel + data: [] +``` + +### Example: Source from HTTP Request Field + +Model `r.Body` as a remote source (field access with empty output path): + +```yaml +extensions: + - addsTo: + pack: codeql/go-all + extensible: sourceModel + data: + - ['net/http', 'Request', True, 'Body', '', '', '', 'remote', 'manual'] +``` + +Note: The output column is `""` (empty) because this models a **field access**, not a method call. + +### Example: Source from HTTP Method Return + +```yaml +extensions: + - addsTo: + pack: codeql/go-all + extensible: sourceModel + data: + - ['net/http', 'Request', True, 'FormValue', '', '', 'ReturnValue', 'remote', 'manual'] +``` + +### Example: Flow Through `strings.Join` + +```yaml +extensions: + - addsTo: + pack: codeql/go-all + extensible: summaryModel + data: + - ['strings', '', False, 'Join', '', '', 'Argument[0..1]', 'ReturnValue', 'taint', 'manual'] +``` + +Note: `Argument[0..1]` is shorthand for both `Argument[0]` and `Argument[1]`. + +### Example: Flow Through Method with Receiver + +```yaml +extensions: + - addsTo: + pack: codeql/go-all + extensible: summaryModel + data: + - [ + 'net/url', + 'URL', + True, + 'Hostname', + '', + '', + 'Argument[receiver]', + 'ReturnValue', + 'taint', + 'manual' + ] +``` + +Note: Go uses `Argument[receiver]` (not `Argument[this]`). 
+ +### Example: Flow Through Variadic Function + +For variadic parameters `...T`, the parameter is treated as `[]T`. Access elements with nested `ArrayElement`: + +```yaml +extensions: + - addsTo: + pack: codeql/go-all + extensible: summaryModel + data: + - [ + 'slices', + '', + False, + 'Concat', + '', + '', + 'Argument[0].ArrayElement.ArrayElement', + 'ReturnValue.ArrayElement', + 'value', + 'manual' + ] +``` + +### Example: Barrier Using `Htmlquote` + +The `Htmlquote` function from the beego framework HTML-escapes a string, preventing HTML injection attacks. The return value is safe. + +```go +func Render(w http.ResponseWriter, r *http.Request) { + name := r.FormValue("name") + safe := beego.Htmlquote(name) // safe is HTML-escaped +} +``` + +```yaml +extensions: + - addsTo: + pack: codeql/go-all + extensible: barrierModel + data: + - ['group:beego', '', True, 'Htmlquote', '', '', 'ReturnValue', 'html-injection', 'manual'] +``` + +Note: The `group:` prefix matches multiple package paths that refer to the same package (configured via `packageGrouping`). The `kind` `"html-injection"` must match the sink kind used by XSS queries. + +### Example: Barrier Guard Using a Validation Function + +A barrier guard models a function returning a boolean indicating whether data is safe. When the function returns the expected value, taint flow is stopped through the guarded branch. + +```go +func Query(db *sql.DB, input string) { + if example.IsSafe(input) { // The check guards the query + db.Query(input) // Safe + } +} +``` + +```yaml +extensions: + - addsTo: + pack: codeql/go-all + extensible: barrierGuardModel + data: + - [ + 'example.com/example', + '', + False, + 'IsSafe', + '', + '', + 'Argument[0]', + 'true', + 'sql-injection', + 'manual' + ] +``` + +Note: The `acceptingValue` `"true"` means the barrier applies when `IsSafe` returns true. The `input` `"Argument[0]"` identifies the first argument whose taint flow is blocked. 
+ +### Additional References + +- **[Go Reference](./go_query_development.prompt.md)** - Go query development diff --git a/.github/prompts/java_data_extension_development.prompt.md b/.github/prompts/java_data_extension_development.prompt.md new file mode 100644 index 0000000..3a03f16 --- /dev/null +++ b/.github/prompts/java_data_extension_development.prompt.md @@ -0,0 +1,294 @@ +--- +mode: agent +--- + +# Java / Kotlin Data Extension + +For general CodeQL data extension model development guidance, see [Common Data Extension Development](./data_extensions_development.prompt.md). +If you need to write a custom CodeQL query instead of a data extension, see [Common Query Development](./query_development.prompt.md). + +## Java/Kotlin-Specific Documentation + +### Documentation + +- [Customizing Library Models for Java and Kotlin](https://codeql.github.com/docs/codeql-language-guides/customizing-library-models-for-java-and-kotlin/) + - Can also be found at [Customizing Library Models for Java and Kotlin Docs](https://github.com/github/codeql/blob/main/docs/codeql/codeql-language-guides/customizing-library-models-for-java-and-kotlin.rst) + +- The VS Code CodeQL model editor provides a guided UI for creating Java/Kotlin models. See [Using the CodeQL model editor](https://docs.github.com/en/code-security/codeql-for-vs-code/using-the-advanced-functionality-of-the-codeql-for-vs-code-extension/using-the-codeql-model-editor). + +### Model Format + +Java/Kotlin uses a **MaD (Models as Data)** format with **9-10 column tuples** that identify callables by fully qualified package, type, method name, and signature. This is fundamentally different from the API Graph-based format used by Python, Ruby, and JavaScript. + +The pack name is `codeql/java-all`. 
+ +#### Extensible predicates + +| Predicate | Columns | Purpose | +| ------------------- | ------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------ | +| `sourceModel` | `(package, type, subtypes, name, signature, ext, output, kind, provenance)` | Model sources of tainted data | +| `sinkModel` | `(package, type, subtypes, name, signature, ext, input, kind, provenance)` | Model sinks | +| `summaryModel` | `(package, type, subtypes, name, signature, ext, input, output, kind, provenance)` | Model flow through methods | +| `barrierModel` | `(package, type, subtypes, name, signature, ext, output, kind, provenance)` | Model barriers (sanitizers) that stop taint flow | +| `barrierGuardModel` | `(package, type, subtypes, name, signature, ext, input, acceptingValue, kind, provenance)` | Model barrier guards (validators) that stop taint via conditional checks | +| `neutralModel` | `(package, type, name, signature, kind, provenance)` | Mark methods as having no dataflow impact | + +#### Tuple column reference + +| Column | Description | Example | +| ---------------- | --------------------------------------------------- | -------------------------------- | +| `package` | Fully qualified package name | `"java.sql"` | +| `type` | Class or interface name | `"Statement"` | +| `subtypes` | Whether model applies to overrides (`True`/`False`) | `True` | +| `name` | Method name (constructors use the class name) | `"execute"` | +| `signature` | Method parameter type signature | `"(String)"` | +| `ext` | Leave empty (`""`) | `""` | +| `input`/`output` | Access path to the input/output of the flow | `"Argument[0]"`, `"ReturnValue"` | +| `kind` | Source/sink/summary kind | `"sql-injection"`, `"taint"` | +| `provenance` | Origin of the model | `"manual"` | + +#### Important: `subtypes` flag + +- `True` — the model applies to the method **and all overrides** in 
subclasses/implementing classes +- `False` — only applies to the exact class specified + +#### Important: `signature` column + +- Type names may be short or **fully qualified**, but the two forms must not be mixed within one signature: `"(String)"` matches a `java.lang.String` parameter +- Multiple parameters: `"(String,int)"` +- Generic type parameters must match source: `"Select"` +- Empty `""` matches any signature (use sparingly) + +### Access Paths + +| Component | Description | +| ------------------ | --------------------------------------- | +| `Argument[n]` | Argument at index n (0-based) | +| `Argument[this]` | The qualifier/receiver of a method call | +| `Argument[n1..n2]` | Range of arguments | +| `ReturnValue` | Return value of the method | +| `Element` | Elements of a collection | +| `Field[name]` | Named field of a class | +| `Parameter[n]` | Parameter at index n of a callback | +| `MapKey` | Key of a map | +| `MapValue` | Value of a map | + +### Sink Kinds + +`sql-injection`, `command-injection`, `code-injection`, `path-injection`, `url-redirection`, `log-injection`, `request-forgery`, `xpath-injection`, `ldap-injection`, `jndi-injection`, `template-injection`, `hostname-verification` + +### Threat Models (Java-specific) + +In addition to `remote` and `local`, Java supports: + +- `android` (`android-external-storage-dir`, `contentprovider`) — Android-specific sources +- `reverse-dns` — reverse DNS lookups + +### Sample Model + +Given a snippet where `stmt.execute(query)` is a SQL injection sink: + +```java +public static void taintsink(Connection conn, String query) throws SQLException { + Statement stmt = conn.createStatement(); + stmt.execute(query); // sink: SQL injection +} +``` + +`jdbc.model.yml` + +```yaml +extensions: + - addsTo: + pack: codeql/java-all + extensible: sourceModel + data: [] + + - addsTo: + pack: codeql/java-all + extensible: sinkModel + data: + - [ + 'java.sql', + 'Statement', + True, + 'execute', + '(String)', + '', + 'Argument[0]', + 'sql-injection', + 'manual' + ] + + - addsTo: + pack: 
codeql/java-all + extensible: summaryModel + data: [] + + - addsTo: + pack: codeql/java-all + extensible: barrierModel + data: [] + + - addsTo: + pack: codeql/java-all + extensible: barrierGuardModel + data: [] + + - addsTo: + pack: codeql/java-all + extensible: neutralModel + data: [] +``` + +### Example: Source from Network Socket + +```yaml +extensions: + - addsTo: + pack: codeql/java-all + extensible: sourceModel + data: + - ['java.net', 'Socket', False, 'getInputStream', '()', '', 'ReturnValue', 'remote', 'manual'] +``` + +### Example: Flow Through `String.concat` + +```yaml +extensions: + - addsTo: + pack: codeql/java-all + extensible: summaryModel + data: + - [ + 'java.lang', + 'String', + False, + 'concat', + '(String)', + '', + 'Argument[this]', + 'ReturnValue', + 'taint', + 'manual' + ] + - [ + 'java.lang', + 'String', + False, + 'concat', + '(String)', + '', + 'Argument[0]', + 'ReturnValue', + 'taint', + 'manual' + ] +``` + +### Example: Flow Through Higher-Order Method `Stream.map` + +```yaml +extensions: + - addsTo: + pack: codeql/java-all + extensible: summaryModel + data: + - [ + 'java.util.stream', + 'Stream', + True, + 'map', + '(Function)', + '', + 'Argument[this].Element', + 'Argument[0].Parameter[0]', + 'value', + 'manual' + ] + - [ + 'java.util.stream', + 'Stream', + True, + 'map', + '(Function)', + '', + 'Argument[0].ReturnValue', + 'ReturnValue.Element', + 'value', + 'manual' + ] +``` + +Note: Two rows are needed — one for flow into the lambda parameter, one for flow from the lambda return to the output stream elements. + +### Example: Neutral Model + +```yaml +extensions: + - addsTo: + pack: codeql/java-all + extensible: neutralModel + data: + - ['java.time', 'Instant', 'now', '()', 'summary', 'manual'] +``` + +### Example: Barrier for Path Injection + +The `File.getName()` method returns only the final component of a path, which protects against path injection vulnerabilities. 
+ +```java +public static void barrier(File file) { + String name = file.getName(); // Only the filename, no directory traversal +} +``` + +```yaml +extensions: + - addsTo: + pack: codeql/java-all + extensible: barrierModel + data: + - ['java.io', 'File', True, 'getName', '()', '', 'ReturnValue', 'path-injection', 'manual'] +``` + +Note: The `kind` `"path-injection"` must match the sink kind used by path injection queries. `subtypes: True` ensures the model applies to subclasses of `File`. + +### Example: Barrier Guard for Request Forgery + +The `URI.isAbsolute()` method returns `false` when the URI is relative and therefore safe for request forgery because it cannot redirect to an external server. + +```java +public static void barrierguard(URI uri) throws IOException { + if (!uri.isAbsolute()) { // The check guards the request + URL url = uri.toURL(); + url.openConnection(); // Safe + } +} +``` + +```yaml +extensions: + - addsTo: + pack: codeql/java-all + extensible: barrierGuardModel + data: + - [ + 'java.net', + 'URI', + True, + 'isAbsolute', + '()', + '', + 'Argument[this]', + 'false', + 'request-forgery', + 'manual' + ] +``` + +Note: The `acceptingValue` `"false"` means the barrier applies when `isAbsolute` returns false (the URI is relative). The `input` `"Argument[this]"` identifies the qualifier (`uri`) whose taint flow is blocked. + +### Additional References + +- **[Java Reference](./java_query_development.prompt.md)** - Java/Kotlin query development diff --git a/.github/prompts/javascript_data_extension_development.prompt.md b/.github/prompts/javascript_data_extension_development.prompt.md new file mode 100644 index 0000000..0abf823 --- /dev/null +++ b/.github/prompts/javascript_data_extension_development.prompt.md @@ -0,0 +1,223 @@ +--- +mode: agent +--- + +# JavaScript / TypeScript Data Extension + +For general CodeQL data extension model development guidance, see [Common Data Extension Development](./data_extensions_development.prompt.md). 
+If you need to write a custom CodeQL query instead of a data extension, see [Common Query Development](./query_development.prompt.md). + +## JavaScript/TypeScript-Specific Documentation + +### Documentation + +- [Customizing Library Models for JavaScript](https://codeql.github.com/docs/codeql-language-guides/customizing-library-models-for-javascript/) + - Can also be found at [Customizing Library Models for JavaScript Docs](https://github.com/github/codeql/blob/main/docs/codeql/codeql-language-guides/customizing-library-models-for-javascript.rst) + +### Model Format + +JavaScript/TypeScript uses an **API Graph-based** model format with short tuples — similar to Python and Ruby. + +The pack name is `codeql/javascript-all`. + +#### Extensible predicates + +| Predicate | Columns | Purpose | +| ------------------- | ------------------------------------ | ------------------------------------------------------------------------ | +| `sourceModel` | `(type, path, kind)` | Model sources of tainted data | +| `sinkModel` | `(type, path, kind)` | Model sinks where tainted data is used vulnerably | +| `summaryModel` | `(type, path, input, output, kind)` | Model flow through function calls | +| `barrierModel` | `(type, path, kind)` | Model barriers (sanitizers) that stop taint flow | +| `barrierGuardModel` | `(type, path, acceptingValue, kind)` | Model barrier guards (validators) that stop taint via conditional checks | +| `typeModel` | `(type1, type2, path)` | Define type relationships | + +#### Type column (first column) + +The `type` column identifies a starting point for access path evaluation: + +- **NPM package name** (e.g., `"execa"`, `"mysql"`) — matches imports of that package. If the package name has dots, surround with single quotes: `'lodash.escape'`. +- **`"global"`** — matches the global object (window). Use this to access global variables/functions like `eval`, `decodeURIComponent`. 
+- **Qualified type** `"<package>.<type>"` (e.g., `"mysql.Connection"`) — matches expressions known to be instances of that type (via type annotations or `typeModel` definitions). + +### Access Paths + +Access paths are `.`-separated, evaluated left to right: + +| Component | Description | +| ----------------- | ------------------------------------------------------------------------------------- | +| `Member[name]` | Property access with the given name | +| `AnyMember` | Any property regardless of name | +| `Argument[n]` | Argument at index n | +| `Argument[this]` | The receiver of a method call | +| `Parameter[n]` | Parameter at index n | +| `Parameter[this]` | The `this` parameter of a function | +| `ReturnValue` | Return value of a function or call | +| `ArrayElement` | An element of an array | +| `MapValue` | A value of a map object | +| `Awaited` | The value of a resolved promise | +| `Instance` | Instances of a class (including subclasses) | +| `Fuzzy` | All values derived from the current value (approximate, useful for complex libraries) | + +**Call site filters** (select a subset of calls matching criteria): + +| Component | Description | +| ----------------------------- | ------------------------------------------------ | +| `WithArity[n]` | Calls with exactly n arguments | +| `WithStringArgument[n=value]` | Calls where argument n is string literal `value` | + +**Decorator components:** + +| Component | Description | +| -------------------- | ------------------------------------------------------ | +| `DecoratedClass` | A class decorated by the current value | +| `DecoratedParameter` | A parameter decorated by the current value | +| `DecoratedMember` | A method/field/accessor decorated by the current value | + +**Middleware component:** + +| Component | Description | +| --------------------- | ------------------------------------------------ | +| `GuardedRouteHandler` | Route handlers guarded by the current middleware | + +**Syntax notes:** + +- Multiple operands: 
`Member[foo,bar]` matches either `foo` or `bar` +- Numeric intervals: `Argument[0..2]` matches arguments 0, 1, or 2 +- Last argument: `Argument[N-1]`, second-to-last: `Argument[N-2]` + +### Sink Kinds + +`code-injection`, `command-injection`, `path-injection`, `sql-injection`, `nosql-injection`, `html-injection`, `request-forgery`, `url-redirection`, `unsafe-deserialization`, `log-injection` + +### Threat Models (JS-specific) + +In addition to `remote` and `local`, JavaScript supports: + +- `database-access-result` — data from database reads +- `view-component-input` — inputs to React/Vue/Angular components (props) + +### Sample Model + +Given a snippet using the `execa` package: + +```javascript +import { shell } from 'execa' +shell(cmd) // sink: command injection +``` + +`execa.model.yml` + +```yaml +extensions: + - addsTo: + pack: codeql/javascript-all + extensible: sinkModel + data: + - ['execa', 'Member[shell].Argument[0]', 'command-injection'] +``` + +### Example: Source from Window Message Events + +```yaml +extensions: + - addsTo: + pack: codeql/javascript-all + extensible: sourceModel + data: + - [ + 'global', + 'Member[addEventListener].WithStringArgument[0=message].Argument[1].Parameter[0].Member[data]', + 'remote' + ] +``` + +Note the use of `WithStringArgument[0=message]` to restrict to only `"message"` event listeners. 
+ +### Example: Using Fuzzy Models + +When a library is complex and precise modeling is difficult, `Fuzzy` approximates all values derived from a package: + +```yaml +extensions: + - addsTo: + pack: codeql/javascript-all + extensible: sinkModel + data: + - ['mysql', 'Fuzzy.Member[query].Argument[0]', 'sql-injection'] +``` + +### Example: typeModel for Untyped Code + +When code lacks type annotations, use `typeModel` to define that a function returns an instance of a known type: + +```yaml +extensions: + - addsTo: + pack: codeql/javascript-all + extensible: typeModel + data: + - ['mysql.Connection', '@example/db', 'Member[getConnection].ReturnValue'] +``` + +### Example: Source with GuardedRouteHandler + +Model a middleware that injects tainted data on `req.data`: + +```yaml +extensions: + - addsTo: + pack: codeql/javascript-all + extensible: sourceModel + data: + - [ + '@example/middleware', + 'Member[injectData].ReturnValue.GuardedRouteHandler.Parameter[0].Member[data]', + 'remote' + ] +``` + +### Example: Barrier Using `encodeURIComponent` + +The `encodeURIComponent` function encodes a string for safe use in URLs, preventing HTML injection when the result is used in HTML contexts. + +```javascript +let escaped = encodeURIComponent(input) // Safe for XSS +document.body.innerHTML = escaped +``` + +```yaml +extensions: + - addsTo: + pack: codeql/javascript-all + extensible: barrierModel + data: + - ['global', 'Member[encodeURIComponent].ReturnValue', 'html-injection'] +``` + +Note: The `type` `"global"` starts at the global object. The `path` navigates to the return value of `encodeURIComponent`. The `kind` `"html-injection"` must match the sink kind used by XSS queries. + +### Example: Barrier Guard Using a Validation Function + +A barrier guard models a function that returns a boolean indicating whether data is safe. When the function returns the expected value, taint flow is stopped through the guarded branch. 
+ +```javascript +if (isValid(userInput)) { + // The check guards the use + db.query(userInput) // Safe +} +``` + +```yaml +extensions: + - addsTo: + pack: codeql/javascript-all + extensible: barrierGuardModel + data: + - ['my-package', 'Member[isValid].Argument[0]', 'true', 'sql-injection'] +``` + +Note: The `acceptingValue` `"true"` means the barrier applies when `isValid` returns true. The `path` `"Member[isValid].Argument[0]"` identifies the value being validated (the first argument). + +### Additional References + +- **[JavaScript Reference](./javascript_query_development.prompt.md)** - JavaScript/TypeScript query development diff --git a/.github/prompts/python_data_extension_development.prompt.md b/.github/prompts/python_data_extension_development.prompt.md new file mode 100644 index 0000000..4745428 --- /dev/null +++ b/.github/prompts/python_data_extension_development.prompt.md @@ -0,0 +1,173 @@ +--- +mode: agent +--- + +# Python Data Extension + +For general CodeQL data extension model development guidance, see [Common Data Extension Development](./data_extensions_development.prompt.md). +If you need to write a custom CodeQL query instead of a data extension, see [Common Query Development](./query_development.prompt.md). 
+ +## Python-Specific Documentation + +### Python Documentation + +- [Customizing Library Models for Python](https://codeql.github.com/docs/codeql-language-guides/customizing-library-models-for-python/) + - Can also be found at [Customizing Library Models for Python Docs](https://github.com/github/codeql/blob/main/docs/codeql/codeql-language-guides/customizing-library-models-for-python.rst) + +- [Using API graphs in Python](https://codeql.github.com/docs/codeql-language-guides/using-api-graphs-in-python/) - the access paths input to the extension tuple are powered by API graphs + +### API Graphs + +- [python/ql/lib/semmle/python/frameworks/data/ModelsAsData.qll](https://github.com/github/codeql/blob/main/python/ql/lib/semmle/python/frameworks/data/ModelsAsData.qll) for Python imports ApiGraphModels and ApiGraphs + - [python/ql/lib/semmle/python/frameworks/data/internal/ApiGraphModels.qll](https://github.com/github/codeql/blob/main/python/ql/lib/semmle/python/frameworks/data/internal/ApiGraphModels.qll) dealing with flow models specified in extensible predicates. + - [python/ql/lib/semmle/python/frameworks/data/internal/ApiGraphModelsSpecific.qll](https://github.com/github/codeql/blob/main/python/ql/lib/semmle/python/frameworks/data/internal/ApiGraphModelsSpecific.qll) handles the Python-specific Member[x] tokens by calling node.getMember(x) on the API graph + +Example query that can be run against a given database to exercise the API graph and confirm that the expected path is built: + +```codeql +import python +import semmle.python.ApiGraphs + +from API::CallNode call +where + call = API::moduleImport("re").getMember("compile").getACall() and + call.getParameter(0, "pattern") = + API::moduleImport("argparse") + .getMember("ArgumentParser") + .getReturn() + .getMember("parse_args") + .getMember(_) +select call +``` + +The parsing works as follows: + +1. 
`AccessPathSyntax.qll` tokenizes the path "Member[sql].Member[connect].ReturnValue.Member[cursor].ReturnValue.Member[execute].Argument[0]" into individual tokens: + +- Member[sql] +- Member[connect] +- ReturnValue +- Member[cursor] +- ReturnValue +- Member[execute] +- Argument[0] + +2. `ApiGraphModels.qll` uses getNodeFromPath() to recursively resolve each token starting from the "databricks" type +3. `ApiGraphModelsSpecific.qll` handles the Python-specific Member[x] tokens by calling node.getMember(x) on the API graph + +### Sample Model + +Given this sample snippet (that would need to be a full piece of code to test this codeql extension) + +```python +from flask import Flask, request +import databricks.sql as dbsql + +app = Flask(__name__) + +@app.get("/q") +def q(): + s = request.args["s"] # remote user input + query = "SELECT * FROM users WHERE name='" + s + "'" # user controls SQL text + + with dbsql.connect(server_hostname="HOST", http_path="HTTP_PATH", access_token="TOKEN") as conn: + with conn.cursor() as cursor: + cursor.execute(query) # sink we want to model with the data extension + return str(cursor.fetchall()) +``` + +This is a sample model that extends the `sql-injection` sinkModel to find instances of `cursor.execute()` as vulnerable. 
+ +`databricks.model.yml` + +```yaml +extensions: + - addsTo: + pack: codeql/python-all + extensible: sourceModel + data: [] + + - addsTo: + pack: codeql/python-all + extensible: sinkModel + data: + # Using API graphs modeling works: + - [ + 'databricks', + 'Member[sql].Member[connect].ReturnValue.Member[cursor].ReturnValue.Member[execute].Argument[0]', + 'sql-injection' + ] + - addsTo: + pack: codeql/python-all + extensible: summaryModel + data: [] + + - addsTo: + pack: codeql/python-all + extensible: barrierModel + data: [] + + - addsTo: + pack: codeql/python-all + extensible: barrierGuardModel + data: [] + + - addsTo: + pack: codeql/python-all + extensible: neutralModel + data: [] + + - addsTo: + pack: codeql/python-all + extensible: typeModel + data: [] +``` + +### Example: Barrier Using `html.escape` + +The `html.escape` function HTML-escapes a string, preventing HTML injection (XSS) attacks. + +```python +import html +escaped = html.escape(unknown) # Safe for XSS +``` + +```yaml +extensions: + - addsTo: + pack: codeql/python-all + extensible: barrierModel + data: + - ['html', 'Member[escape].ReturnValue', 'html-injection'] +``` + +Note: The `type` `"html"` starts at the `html` module import. The `path` navigates to the return value of `escape`. The `kind` `"html-injection"` must match the sink kind used by XSS queries. + +### Example: Barrier Guard Using Django URL Validation + +The `url_has_allowed_host_and_scheme` function from Django validates that a URL is safe for redirects. + +```python +if url_has_allowed_host_and_scheme(url, allowed_hosts=...): + redirect(url) # Safe +``` + +```yaml +extensions: + - addsTo: + pack: codeql/python-all + extensible: barrierGuardModel + data: + - [ + 'django', + 'Member[utils].Member[http].Member[url_has_allowed_host_and_scheme].Argument[0,url:]', + 'true', + 'url-redirection' + ] +``` + +Note: The `acceptingValue` `"true"` means the barrier applies when the function returns true. 
`Argument[0,url:]` matches either the first positional argument or the keyword argument `url`. + +### Additional References + +- **[Python Reference](./python_query_development.prompt.md)** - Python query development diff --git a/.github/prompts/ruby_data_extension_development.prompt.md b/.github/prompts/ruby_data_extension_development.prompt.md new file mode 100644 index 0000000..8065809 --- /dev/null +++ b/.github/prompts/ruby_data_extension_development.prompt.md @@ -0,0 +1,212 @@ +--- +mode: agent +--- + +# Ruby Data Extension + +For general CodeQL data extension model development guidance, see [Common Data Extension Development](./data_extensions_development.prompt.md). +If you need to write a custom CodeQL query instead of a data extension, see [Common Query Development](./query_development.prompt.md). + +## Ruby-Specific Documentation + +### Ruby Documentation + +- [Customizing Library Models for Ruby](https://codeql.github.com/docs/codeql-language-guides/customizing-library-models-for-ruby/) + - Can also be found at [Customizing Library Models for Ruby Docs](https://github.com/github/codeql/blob/main/docs/codeql/codeql-language-guides/customizing-library-models-for-ruby.rst) + +- [Using API graphs in Ruby](https://codeql.github.com/docs/codeql-language-guides/using-api-graphs-in-ruby/) - access paths in extensions are powered by API graphs + +### Model Format + +Ruby uses an **API Graph-based** model format with short tuples — similar to Python and JavaScript. + +The pack name is `codeql/ruby-all`. 
+ +#### Extensible predicates + +| Predicate | Columns | Purpose | +| ------------------- | ------------------------------------ | ------------------------------------------------------------------------ | +| `sourceModel` | `(type, path, kind)` | Model sources of tainted data | +| `sinkModel` | `(type, path, kind)` | Model sinks where tainted data is used vulnerably | +| `summaryModel` | `(type, path, input, output, kind)` | Model flow through method calls | +| `barrierModel` | `(type, path, kind)` | Model barriers (sanitizers) that stop taint flow | +| `barrierGuardModel` | `(type, path, acceptingValue, kind)` | Model barrier guards (validators) that stop taint via conditional checks | +| `typeModel` | `(type1, type2, path)` | Define type relationships | + +#### Type column + +The `type` column identifies a starting point for access path evaluation: + +- A class name like `"TTY::Command"` matches instances of that class +- Appending `!` (e.g., `"Sinatra::Base!"`) matches references to the **class itself** rather than instances +- `typeModel` rows can define aliases so that subtypes inherit all models from a parent type + +### Access Paths + +Access paths are `.`-separated, evaluated left to right: + +| Component | Description | +| ----------------------- | ------------------------------------------------------- | +| `Method[name]` | Calls to the named method | +| `Argument[n]` | Argument at index n | +| `Argument[name:]` | Keyword argument with the given name | +| `Argument[self]` | The receiver of a method call | +| `Argument[block]` | The block argument | +| `Argument[any]` | Any argument (except self/block) | +| `Argument[any-named]` | Any keyword argument | +| `Argument[hash-splat]` | All keyword arguments (`**kwargs`) | +| `Parameter[n]` | Parameter at index n | +| `Parameter[name:]` | Keyword parameter with the given name | +| `Parameter[self]` | The self parameter | +| `Parameter[block]` | The block parameter | +| `Parameter[any]` | Any parameter (except 
self/block) | +| `Parameter[any-named]` | Any keyword parameter | +| `Parameter[hash-splat]` | Hash splat parameter | +| `ReturnValue` | Return value of a call | +| `Element[any]` | Any element of an array or hash | +| `Element[n]` | Array element at the given index | +| `Element[key]` | Hash element at the given key | +| `Field[@name]` | Instance variable with the given name | +| `Fuzzy` | All values derived from the current value (approximate) | + +**Syntax notes:** + +- Multiple operands: `Method[foo,bar]` matches calls to either `foo` or `bar` +- Numeric ranges: `Argument[1..]` matches all arguments from index 1 onward + +### Sink Kinds + +`code-injection`, `command-injection`, `path-injection`, `sql-injection`, `url-redirection`, `log-injection` + +### Sample Model + +Given a snippet using the `tty-command` gem: + +```ruby +tty = TTY::Command.new +tty.run(cmd) # sink: command injection +``` + +`tty_command.model.yml` + +```yaml +extensions: + - addsTo: + pack: codeql/ruby-all + extensible: sourceModel + data: [] + + - addsTo: + pack: codeql/ruby-all + extensible: sinkModel + data: + - ['TTY::Command', 'Method[run].Argument[0]', 'command-injection'] + + - addsTo: + pack: codeql/ruby-all + extensible: summaryModel + data: [] + + - addsTo: + pack: codeql/ruby-all + extensible: barrierModel + data: [] + + - addsTo: + pack: codeql/ruby-all + extensible: barrierGuardModel + data: [] + + - addsTo: + pack: codeql/ruby-all + extensible: typeModel + data: [] +``` + +### Example: Flow Through a Method + +Model flow through `URI.decode_uri_component`: + +```yaml +extensions: + - addsTo: + pack: codeql/ruby-all + extensible: summaryModel + data: + - ['URI!', 'Method[decode_uri_component]', 'Argument[0]', 'ReturnValue', 'taint'] +``` + +Note: `URI!` with the `!` suffix matches the class itself (not instances), since `decode_uri_component` is a class method. 
+ +### Example: Source from Block Parameters + +Model `x` in a Sinatra route block as a remote source: + +```yaml +extensions: + - addsTo: + pack: codeql/ruby-all + extensible: sourceModel + data: + - ['Sinatra::Base!', 'Method[get].Argument[block].Parameter[0]', 'remote'] +``` + +### Example: typeModel for Subclass Inheritance + +When `Mysql2::EM::Client` is a subclass of `Mysql2::Client`, add a type model so all parent models apply: + +```yaml +extensions: + - addsTo: + pack: codeql/ruby-all + extensible: typeModel + data: + - ['Mysql2::Client', 'Mysql2::EM::Client', ''] +``` + +### Example: Barrier Using `Mysql2::Client#escape` + +The `escape` method on `Mysql2::Client` escapes special characters in a string for use in SQL statements, preventing SQL injection. + +```ruby +client = Mysql2::Client.new +escaped = client.escape(input) # Safe for SQL injection +client.query("SELECT * FROM users WHERE name = '#{escaped}'") +``` + +```yaml +extensions: + - addsTo: + pack: codeql/ruby-all + extensible: barrierModel + data: + - ['Mysql2::Client', 'Method[escape].ReturnValue', 'sql-injection'] +``` + +Note: The `type` `"Mysql2::Client"` matches instances of the class. The `kind` `"sql-injection"` must match the sink kind used by SQL injection queries. + +### Example: Barrier Guard Using a Validation Method + +A barrier guard models a method that returns a boolean indicating whether data is safe. When the method returns the expected value, taint flow is stopped through the guarded branch. + +```ruby +if Validator.is_safe(user_input) + # The check guards the use, so the input is safe. + client.query("SELECT * FROM users WHERE name = '#{user_input}'") +end +``` + +```yaml +extensions: + - addsTo: + pack: codeql/ruby-all + extensible: barrierGuardModel + data: + - ['Validator!', 'Method[is_safe].Argument[0]', 'true', 'sql-injection'] +``` + +Note: The `!` suffix on `"Validator!"` matches the class itself (not instances), since `is_safe` is a class method. 
The `acceptingValue` `"true"` means the barrier applies when `is_safe` returns true. + +### Additional References + +- **[Ruby Reference](./ruby_query_development.prompt.md)** - Ruby query development