Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NEXT_CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
* Remove API enum values and types that are still in development from the `databricks-bundles` Python package; these were never accepted by the backend ([#5484](https://github.com/databricks/cli/pull/5484)).
* direct: Fix resolving a resource reference that is used more than once within the same field ([#5558](https://github.com/databricks/cli/pull/5558)).
* Bundle variable references now accept Unicode letters in path segments (e.g. `${var.变量}`). ([#5532](https://github.com/databricks/cli/pull/5532))
* Ignore remote changes for vector search direct_access_index_spec.schema_json to prevent drift when the backend normalizes the schema ([#5481](https://github.com/databricks/cli/pull/5481)).

### Dependency updates

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ resources:
primary_key: id
index_type: DIRECT_ACCESS
direct_access_index_spec:
schema_json: '{"id":"integer","vector":"array<float>"}'
schema_json: '{"id":"int","vector":"array<float>"}'
Comment thread
janniklasrose marked this conversation as resolved.
embedding_vector_columns:
- name: vector
embedding_dimension: 768
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"endpoint_type": "STANDARD"
}

>>> [CLI] vector-search-indexes create-index --json {"name":"main.default.test_vs_index_[UNIQUE_NAME]","endpoint_name":"test-vs-endpoint-[UNIQUE_NAME]","primary_key":"id","index_type":"DIRECT_ACCESS","direct_access_index_spec":{"schema_json":"{\"id\":\"integer\",\"vector\":\"array<float>\"}","embedding_vector_columns":[{"name":"vector","embedding_dimension":768}]}}
>>> [CLI] vector-search-indexes create-index --json {"name":"main.default.test_vs_index_[UNIQUE_NAME]","endpoint_name":"test-vs-endpoint-[UNIQUE_NAME]","primary_key":"id","index_type":"DIRECT_ACCESS","direct_access_index_spec":{"schema_json":"{\"id\":\"int\",\"vector\":\"array<float>\"}","embedding_vector_columns":[{"name":"vector","embedding_dimension":768}]}}
{
"name": "main.default.test_vs_index_[UNIQUE_NAME]",
"endpoint_name": "test-vs-endpoint-[UNIQUE_NAME]",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ trap cleanup EXIT

trace $CLI vector-search-endpoints create-endpoint "${ENDPOINT_NAME}" STANDARD | jq '{name, endpoint_type}'

trace $CLI vector-search-indexes create-index --json "{\"name\":\"${INDEX_NAME}\",\"endpoint_name\":\"${ENDPOINT_NAME}\",\"primary_key\":\"id\",\"index_type\":\"DIRECT_ACCESS\",\"direct_access_index_spec\":{\"schema_json\":\"{\\\"id\\\":\\\"integer\\\",\\\"vector\\\":\\\"array<float>\\\"}\",\"embedding_vector_columns\":[{\"name\":\"vector\",\"embedding_dimension\":768}]}}" | jq '{name, endpoint_name, index_type, primary_key}'
trace $CLI vector-search-indexes create-index --json "{\"name\":\"${INDEX_NAME}\",\"endpoint_name\":\"${ENDPOINT_NAME}\",\"primary_key\":\"id\",\"index_type\":\"DIRECT_ACCESS\",\"direct_access_index_spec\":{\"schema_json\":\"{\\\"id\\\":\\\"int\\\",\\\"vector\\\":\\\"array<float>\\\"}\",\"embedding_vector_columns\":[{\"name\":\"vector\",\"embedding_dimension\":768}]}}" | jq '{name, endpoint_name, index_type, primary_key}'

trace $CLI bundle deployment bind index1 "${INDEX_NAME}" --auto-approve

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ Plan: 1 to add, 0 to change, 1 to delete, 1 unchanged
"name": "vector"
}
],
"schema_json": "{\"id\":\"integer\",\"vector\":\"array\u003cfloat\u003e\"}"
"schema_json": "{\"id\":\"int\",\"vector\":\"array\u003cfloat\u003e\"}"
},
"endpoint_name": "vs-endpoint-[UNIQUE_NAME]",
"endpoint_uuid": "[UUID]",
Expand All @@ -82,6 +82,13 @@ Plan: 1 to add, 0 to change, 1 to delete, 1 unchanged
}
},
"changes": {
"direct_access_index_spec.schema_json": {
"action": "skip",
"reason": "normalized_by_backend",
"old": "{\"id\":\"integer\",\"vector\":\"array\u003cfloat\u003e\"}",
"new": "{\"id\":\"integer\",\"vector\":\"array\u003cfloat\u003e\"}",
"remote": "{\"id\":\"int\",\"vector\":\"array\u003cfloat\u003e\"}"
},
"endpoint_uuid": {
"action": "skip",
"reason": "state-only field",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
bundle:
name: vs-index-schema-$UNIQUE_NAME

sync:
paths: []

resources:
vector_search_endpoints:
my_endpoint:
name: vs-endpoint-$UNIQUE_NAME
endpoint_type: STANDARD
vector_search_indexes:
my_index:
name: main.default.vs_index_$UNIQUE_NAME
endpoint_name: ${resources.vector_search_endpoints.my_endpoint.name}
primary_key: id
index_type: DIRECT_ACCESS
direct_access_index_spec:
schema_json: '{"id":"integer","count":"long","small":"short","tiny":"byte","tags":"array<integer>","score":"float","label":"string","vector":"array<float>"}'
embedding_vector_columns:
- name: vector
embedding_dimension: 768

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@

>>> [CLI] bundle deploy
Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/vs-index-schema-[UNIQUE_NAME]/default/files...
Deploying resources...
Updating deployment state...
Deployment complete!

>>> [CLI] vector-search-indexes get-index main.default.vs_index_[UNIQUE_NAME]
{"count":"bigint","id":"int","label":"string","score":"float","small":"smallint","tags":"array<int>","tiny":"tinyint","vector":"array<float>"}

>>> [CLI] bundle plan
Plan: 0 to add, 0 to change, 0 to delete, 2 unchanged

>>> [CLI] bundle destroy --auto-approve
The following resources will be deleted:
delete resources.vector_search_endpoints.my_endpoint
delete resources.vector_search_indexes.my_index

This action will result in the deletion of the following Vector Search indexes.
For Delta Sync indexes, the source Delta Table is preserved but the embedding pipeline is removed.
For Direct Access indexes, all upserted vectors are permanently lost:
delete resources.vector_search_indexes.my_index

All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/vs-index-schema-[UNIQUE_NAME]/default

Deleting files...
Destroy complete!
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
envsubst < databricks.yml.tmpl > databricks.yml

cleanup() {
trace $CLI bundle destroy --auto-approve
rm -f out.requests.txt
}
trap cleanup EXIT

trace $CLI bundle deploy

# The backend (and the test server) rewrite the schema on create, so
# get-index returns Spark type names and sorted keys, not the config literal.
index_name="main.default.vs_index_${UNIQUE_NAME}"
trace $CLI vector-search-indexes get-index "${index_name}" | jq -r '.direct_access_index_spec.schema_json'

# Re-plan must be a no-op: remote changes to schema_json are ignored
# (ignore_remote_changes), so the immutable spec does not plan a recreate.
trace $CLI bundle plan
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Cloud = false
10 changes: 10 additions & 0 deletions bundle/direct/dresources/resources.yml
Original file line number Diff line number Diff line change
Expand Up @@ -638,6 +638,16 @@ resources:
reason: immutable
- field: direct_access_index_spec
reason: immutable
ignore_remote_changes:
# The backend rewrites schema_json on create: user-facing type names
# ("integer", "long", "short", "byte") are stored in Unity Catalog as
# Spark type names ("int", "bigint", "smallint", "tinyint") and the
# columns come back in sorted key order, so GET never echoes the user's
# literal input. Without this rule the rewrite reads as a change to the
# immutable direct_access_index_spec and plans a destructive recreate
# that drops all upserted vectors.
- field: direct_access_index_spec.schema_json
reason: normalized_by_backend
backend_defaults:
# The Vector Search API assigns index_subtype when the config omits it
- field: index_subtype
61 changes: 61 additions & 0 deletions libs/testserver/vector_search_indexes.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package testserver

import (
"bytes"
"encoding/json"
"fmt"
"net/http"
Expand Down Expand Up @@ -70,6 +71,14 @@ func (s *FakeWorkspace) VectorSearchIndexCreate(req Request) Response {
indexSubtype = vectorsearch.IndexSubtypeHybrid
}

// The backend rewrites schema_json on create: user-facing type names are
// stored as Spark type names (e.g. "integer" -> "int") and the columns are
// returned in sorted key order rather than the user's original order.
// Mirror that here so the create -> get round-trip matches the real API.
if createReq.DirectAccessIndexSpec != nil {
createReq.DirectAccessIndexSpec.SchemaJson = normalizeSchemaJSON(createReq.DirectAccessIndexSpec.SchemaJson)
}

index := fakeVectorSearchIndex{
VectorIndex: vectorsearch.VectorIndex{
Creator: s.CurrentUser().UserName,
Expand Down Expand Up @@ -110,6 +119,58 @@ func isValidIndexName(name string) bool {
return true
}

// normalizeSchemaJSON rewrites a schema_json document the way the backend
// stores it: user-facing column type names are folded to Spark type names and
// the columns are re-serialized in sorted key order (encoding/json sorts map
// keys, matching the backend). Returns the input unchanged when it isn't the
// expected {"column":"type"} JSON object.
func normalizeSchemaJSON(schemaJSON string) string {
if schemaJSON == "" {
return schemaJSON
}
var schema map[string]string
if err := json.Unmarshal([]byte(schemaJSON), &schema); err != nil {
return schemaJSON
}
for column, columnType := range schema {
schema[column] = normalizeColumnType(columnType)
}
// Disable HTML escaping so array<...> keeps its angle brackets verbatim
// rather than being rewritten to < / >.
var buf bytes.Buffer
enc := json.NewEncoder(&buf)
enc.SetEscapeHTML(false)
if err := enc.Encode(schema); err != nil {
return schemaJSON
}
return strings.TrimRight(buf.String(), "\n")
}

// normalizeColumnType maps the user-facing column type names the Vector
// Search API accepts ("integer", "long", "short", "byte") to the Spark type
// names Unity Catalog stores and GET returns, recursing into array element
// types. Types whose user-facing and Spark spellings coincide ("float",
// "string", ...) pass through unchanged.
func normalizeColumnType(columnType string) string {
if inner, ok := strings.CutPrefix(columnType, "array<"); ok {
if elem, ok := strings.CutSuffix(inner, ">"); ok {
return "array<" + normalizeColumnType(elem) + ">"
}
}
switch columnType {
case "integer":
return "int"
case "long":
return "bigint"
case "short":
return "smallint"
case "byte":
return "tinyint"
default:
return columnType
}
}

// remapDeltaSyncSpec converts a request spec to a response spec.
func remapDeltaSyncSpec(req *vectorsearch.DeltaSyncVectorIndexSpecRequest) *vectorsearch.DeltaSyncVectorIndexSpecResponse {
if req == nil {
Expand Down
51 changes: 51 additions & 0 deletions libs/testserver/vector_search_indexes_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package testserver

import (
"testing"

"github.com/stretchr/testify/assert"
)

func TestNormalizeSchemaJSON(t *testing.T) {
tests := []struct {
name string
in string
want string
}{
{
name: "user-facing type stored as Spark type name",
in: `{"id":"integer","vector":"array<float>"}`,
want: `{"id":"int","vector":"array<float>"}`,
},
{
name: "all integer-family names",
in: `{"a":"long","b":"short","c":"byte"}`,
want: `{"a":"bigint","b":"smallint","c":"tinyint"}`,
},
{
name: "array element type is mapped",
in: `{"tags":"array<integer>"}`,
want: `{"tags":"array<int>"}`,
},
{
name: "matching spellings pass through and keys are sorted",
in: `{"y":"float","x":"string","z":"int"}`,
want: `{"x":"string","y":"float","z":"int"}`,
},
{
name: "empty input",
in: "",
want: "",
},
{
name: "non-object input is returned unchanged",
in: "not json",
want: "not json",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
assert.Equal(t, tt.want, normalizeSchemaJSON(tt.in))
})
}
}
Loading