diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index f26e1ea1afc..9c5ce923dd5 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -11,6 +11,7 @@ * Remove API enum values and types that are still in development from the `databricks-bundles` Python package; these were never accepted by the backend ([#5484](https://github.com/databricks/cli/pull/5484)). * direct: Fix resolving a resource reference that is used more than once within the same field ([#5558](https://github.com/databricks/cli/pull/5558)). * Bundle variable references now accept Unicode letters in path segments (e.g. `${var.变量}`). ([#5532](https://github.com/databricks/cli/pull/5532)) +* Ignore remote changes for vector search direct_access_index_spec.schema_json to prevent drift when the backend normalizes the schema ([#5481](https://github.com/databricks/cli/pull/5481)). ### Dependency updates diff --git a/acceptance/bundle/deployment/bind/vector_search_index/databricks.yml.tmpl b/acceptance/bundle/deployment/bind/vector_search_index/databricks.yml.tmpl index 29692b4450c..e057c575ed8 100644 --- a/acceptance/bundle/deployment/bind/vector_search_index/databricks.yml.tmpl +++ b/acceptance/bundle/deployment/bind/vector_search_index/databricks.yml.tmpl @@ -12,7 +12,7 @@ resources: primary_key: id index_type: DIRECT_ACCESS direct_access_index_spec: - schema_json: '{"id":"integer","vector":"array"}' + schema_json: '{"id":"int","vector":"array"}' embedding_vector_columns: - name: vector embedding_dimension: 768 diff --git a/acceptance/bundle/deployment/bind/vector_search_index/output.txt b/acceptance/bundle/deployment/bind/vector_search_index/output.txt index f1b79186906..a91168a5137 100644 --- a/acceptance/bundle/deployment/bind/vector_search_index/output.txt +++ b/acceptance/bundle/deployment/bind/vector_search_index/output.txt @@ -5,7 +5,7 @@ "endpoint_type": "STANDARD" } ->>> [CLI] vector-search-indexes create-index --json 
{"name":"main.default.test_vs_index_[UNIQUE_NAME]","endpoint_name":"test-vs-endpoint-[UNIQUE_NAME]","primary_key":"id","index_type":"DIRECT_ACCESS","direct_access_index_spec":{"schema_json":"{\"id\":\"integer\",\"vector\":\"array\"}","embedding_vector_columns":[{"name":"vector","embedding_dimension":768}]}} +>>> [CLI] vector-search-indexes create-index --json {"name":"main.default.test_vs_index_[UNIQUE_NAME]","endpoint_name":"test-vs-endpoint-[UNIQUE_NAME]","primary_key":"id","index_type":"DIRECT_ACCESS","direct_access_index_spec":{"schema_json":"{\"id\":\"int\",\"vector\":\"array\"}","embedding_vector_columns":[{"name":"vector","embedding_dimension":768}]}} { "name": "main.default.test_vs_index_[UNIQUE_NAME]", "endpoint_name": "test-vs-endpoint-[UNIQUE_NAME]", diff --git a/acceptance/bundle/deployment/bind/vector_search_index/script b/acceptance/bundle/deployment/bind/vector_search_index/script index 3d07efacf93..16f36a496fd 100644 --- a/acceptance/bundle/deployment/bind/vector_search_index/script +++ b/acceptance/bundle/deployment/bind/vector_search_index/script @@ -11,7 +11,7 @@ trap cleanup EXIT trace $CLI vector-search-endpoints create-endpoint "${ENDPOINT_NAME}" STANDARD | jq '{name, endpoint_type}' -trace $CLI vector-search-indexes create-index --json "{\"name\":\"${INDEX_NAME}\",\"endpoint_name\":\"${ENDPOINT_NAME}\",\"primary_key\":\"id\",\"index_type\":\"DIRECT_ACCESS\",\"direct_access_index_spec\":{\"schema_json\":\"{\\\"id\\\":\\\"integer\\\",\\\"vector\\\":\\\"array\\\"}\",\"embedding_vector_columns\":[{\"name\":\"vector\",\"embedding_dimension\":768}]}}" | jq '{name, endpoint_name, index_type, primary_key}' +trace $CLI vector-search-indexes create-index --json 
"{\"name\":\"${INDEX_NAME}\",\"endpoint_name\":\"${ENDPOINT_NAME}\",\"primary_key\":\"id\",\"index_type\":\"DIRECT_ACCESS\",\"direct_access_index_spec\":{\"schema_json\":\"{\\\"id\\\":\\\"int\\\",\\\"vector\\\":\\\"array\\\"}\",\"embedding_vector_columns\":[{\"name\":\"vector\",\"embedding_dimension\":768}]}}" | jq '{name, endpoint_name, index_type, primary_key}' trace $CLI bundle deployment bind index1 "${INDEX_NAME}" --auto-approve diff --git a/acceptance/bundle/resources/vector_search_indexes/recreate/with_endpoint/output.txt b/acceptance/bundle/resources/vector_search_indexes/recreate/with_endpoint/output.txt index 20c139225ed..deb3c925c09 100644 --- a/acceptance/bundle/resources/vector_search_indexes/recreate/with_endpoint/output.txt +++ b/acceptance/bundle/resources/vector_search_indexes/recreate/with_endpoint/output.txt @@ -69,7 +69,7 @@ Plan: 1 to add, 0 to change, 1 to delete, 1 unchanged "name": "vector" } ], - "schema_json": "{\"id\":\"integer\",\"vector\":\"array\u003cfloat\u003e\"}" + "schema_json": "{\"id\":\"int\",\"vector\":\"array\u003cfloat\u003e\"}" }, "endpoint_name": "vs-endpoint-[UNIQUE_NAME]", "endpoint_uuid": "[UUID]", @@ -82,6 +82,13 @@ Plan: 1 to add, 0 to change, 1 to delete, 1 unchanged } }, "changes": { + "direct_access_index_spec.schema_json": { + "action": "skip", + "reason": "normalized_by_backend", + "old": "{\"id\":\"integer\",\"vector\":\"array\u003cfloat\u003e\"}", + "new": "{\"id\":\"integer\",\"vector\":\"array\u003cfloat\u003e\"}", + "remote": "{\"id\":\"int\",\"vector\":\"array\u003cfloat\u003e\"}" + }, "endpoint_uuid": { "action": "skip", "reason": "state-only field", diff --git a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/databricks.yml.tmpl b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/databricks.yml.tmpl new file mode 100644 index 00000000000..d4725692991 --- /dev/null +++ b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/databricks.yml.tmpl @@ 
-0,0 +1,22 @@ +bundle: + name: vs-index-schema-$UNIQUE_NAME + +sync: + paths: [] + +resources: + vector_search_endpoints: + my_endpoint: + name: vs-endpoint-$UNIQUE_NAME + endpoint_type: STANDARD + vector_search_indexes: + my_index: + name: main.default.vs_index_$UNIQUE_NAME + endpoint_name: ${resources.vector_search_endpoints.my_endpoint.name} + primary_key: id + index_type: DIRECT_ACCESS + direct_access_index_spec: + schema_json: '{"id":"integer","count":"long","small":"short","tiny":"byte","tags":"array","score":"float","label":"string","vector":"array"}' + embedding_vector_columns: + - name: vector + embedding_dimension: 768 diff --git a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/out.test.toml b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/out.test.toml new file mode 100644 index 00000000000..48203e833cd --- /dev/null +++ b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = true +CloudSlow = true +RequiresUnityCatalog = true +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/output.txt b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/output.txt new file mode 100644 index 00000000000..6e46bc6c8bf --- /dev/null +++ b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/output.txt @@ -0,0 +1,27 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/vs-index-schema-[UNIQUE_NAME]/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
+ +>>> [CLI] vector-search-indexes get-index main.default.vs_index_[UNIQUE_NAME] +{"count":"bigint","id":"int","label":"string","score":"float","small":"smallint","tags":"array","tiny":"tinyint","vector":"array"} + +>>> [CLI] bundle plan +Plan: 0 to add, 0 to change, 0 to delete, 2 unchanged + +>>> [CLI] bundle destroy --auto-approve +The following resources will be deleted: + delete resources.vector_search_endpoints.my_endpoint + delete resources.vector_search_indexes.my_index + +This action will result in the deletion of the following Vector Search indexes. +For Delta Sync indexes, the source Delta Table is preserved but the embedding pipeline is removed. +For Direct Access indexes, all upserted vectors are permanently lost: + delete resources.vector_search_indexes.my_index + +All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/vs-index-schema-[UNIQUE_NAME]/default + +Deleting files... +Destroy complete! diff --git a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/script b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/script new file mode 100644 index 00000000000..d7a56a8ff22 --- /dev/null +++ b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/script @@ -0,0 +1,18 @@ +envsubst < databricks.yml.tmpl > databricks.yml + +cleanup() { + trace $CLI bundle destroy --auto-approve + rm -f out.requests.txt +} +trap cleanup EXIT + +trace $CLI bundle deploy + +# The backend (and the test server) rewrite the schema on create, so +# get-index returns Spark type names and sorted keys, not the config literal. +index_name="main.default.vs_index_${UNIQUE_NAME}" +trace $CLI vector-search-indexes get-index "${index_name}" | jq -r '.direct_access_index_spec.schema_json' + +# Re-plan must be a no-op: remote changes to schema_json are ignored +# (ignore_remote_changes), so the immutable spec does not plan a recreate. 
+trace $CLI bundle plan diff --git a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/test.toml b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/test.toml new file mode 100644 index 00000000000..18b1a88417e --- /dev/null +++ b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/test.toml @@ -0,0 +1 @@ +Cloud = false diff --git a/bundle/direct/dresources/resources.yml b/bundle/direct/dresources/resources.yml index 5a017a4b24d..9c12da9f39b 100644 --- a/bundle/direct/dresources/resources.yml +++ b/bundle/direct/dresources/resources.yml @@ -638,6 +638,16 @@ resources: reason: immutable - field: direct_access_index_spec reason: immutable + ignore_remote_changes: + # The backend rewrites schema_json on create: user-facing type names + # ("integer", "long", "short", "byte") are stored in Unity Catalog as + # Spark type names ("int", "bigint", "smallint", "tinyint") and the + # columns come back in sorted key order, so GET never echoes the user's + # literal input. Without this rule the rewrite reads as a change to the + # immutable direct_access_index_spec and plans a destructive recreate + # that drops all upserted vectors. + - field: direct_access_index_spec.schema_json + reason: normalized_by_backend backend_defaults: # The Vector Search API assigns index_subtype when the config omits it - field: index_subtype diff --git a/libs/testserver/vector_search_indexes.go b/libs/testserver/vector_search_indexes.go index 15105682cf7..abf90080200 100644 --- a/libs/testserver/vector_search_indexes.go +++ b/libs/testserver/vector_search_indexes.go @@ -1,6 +1,7 @@ package testserver import ( + "bytes" "encoding/json" "fmt" "net/http" @@ -70,6 +71,14 @@ func (s *FakeWorkspace) VectorSearchIndexCreate(req Request) Response { indexSubtype = vectorsearch.IndexSubtypeHybrid } + // The backend rewrites schema_json on create: user-facing type names are + // stored as Spark type names (e.g. 
"integer" -> "int") and the columns are + // returned in sorted key order rather than the user's original order. + // Mirror that here so the create -> get round-trip matches the real API. + if createReq.DirectAccessIndexSpec != nil { + createReq.DirectAccessIndexSpec.SchemaJson = normalizeSchemaJSON(createReq.DirectAccessIndexSpec.SchemaJson) + } + index := fakeVectorSearchIndex{ VectorIndex: vectorsearch.VectorIndex{ Creator: s.CurrentUser().UserName, @@ -110,6 +119,58 @@ func isValidIndexName(name string) bool { return true } +// normalizeSchemaJSON rewrites a schema_json document the way the backend +// stores it: user-facing column type names are folded to Spark type names and +// the columns are re-serialized in sorted key order (encoding/json sorts map +// keys, matching the backend). Returns the input unchanged when it isn't the +// expected {"column":"type"} JSON object. +func normalizeSchemaJSON(schemaJSON string) string { + if schemaJSON == "" { + return schemaJSON + } + var schema map[string]string + if err := json.Unmarshal([]byte(schemaJSON), &schema); err != nil { + return schemaJSON + } + for column, columnType := range schema { + schema[column] = normalizeColumnType(columnType) + } + // Disable HTML escaping so array<...> keeps its angle brackets verbatim + // rather than being rewritten to < / >. + var buf bytes.Buffer + enc := json.NewEncoder(&buf) + enc.SetEscapeHTML(false) + if err := enc.Encode(schema); err != nil { + return schemaJSON + } + return strings.TrimRight(buf.String(), "\n") +} + +// normalizeColumnType maps the user-facing column type names the Vector +// Search API accepts ("integer", "long", "short", "byte") to the Spark type +// names Unity Catalog stores and GET returns, recursing into array element +// types. Types whose user-facing and Spark spellings coincide ("float", +// "string", ...) pass through unchanged. 
+func normalizeColumnType(columnType string) string { + if inner, ok := strings.CutPrefix(columnType, "array<"); ok { + if elem, ok := strings.CutSuffix(inner, ">"); ok { + return "array<" + normalizeColumnType(elem) + ">" + } + } + switch columnType { + case "integer": + return "int" + case "long": + return "bigint" + case "short": + return "smallint" + case "byte": + return "tinyint" + default: + return columnType + } +} + // remapDeltaSyncSpec converts a request spec to a response spec. func remapDeltaSyncSpec(req *vectorsearch.DeltaSyncVectorIndexSpecRequest) *vectorsearch.DeltaSyncVectorIndexSpecResponse { if req == nil { diff --git a/libs/testserver/vector_search_indexes_test.go b/libs/testserver/vector_search_indexes_test.go new file mode 100644 index 00000000000..6df73895fde --- /dev/null +++ b/libs/testserver/vector_search_indexes_test.go @@ -0,0 +1,51 @@ +package testserver + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestNormalizeSchemaJSON(t *testing.T) { + tests := []struct { + name string + in string + want string + }{ + { + name: "user-facing type stored as Spark type name", + in: `{"id":"integer","vector":"array<float>"}`, + want: `{"id":"int","vector":"array<float>"}`, + }, + { + name: "all integer-family names", + in: `{"a":"long","b":"short","c":"byte"}`, + want: `{"a":"bigint","b":"smallint","c":"tinyint"}`, + }, + { + name: "array element type is mapped", + in: `{"tags":"array<integer>"}`, + want: `{"tags":"array<int>"}`, + }, + { + name: "matching spellings pass through and keys are sorted", + in: `{"y":"float","x":"string","z":"int"}`, + want: `{"x":"string","y":"float","z":"int"}`, + }, + { + name: "empty input", + in: "", + want: "", + }, + { + name: "non-object input is returned unchanged", + in: "not json", + want: "not json", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, normalizeSchemaJSON(tt.in)) + }) + } +}