From 77f038f5235367112f0537a48c9788a89b8eb24d Mon Sep 17 00:00:00 2001 From: DuckDB Labs GitHub Bot Date: Wed, 1 Jul 2026 16:07:41 +0000 Subject: [PATCH 1/2] Update vendored DuckDB sources to 3cb65aa794 --- src/duckdb/extension/icu/icu-datefunc.cpp | 2 +- src/duckdb/extension/icu/icu-datesub.cpp | 32 ++++++++++++------- .../function/table/version/pragma_version.cpp | 6 ++-- .../common/multi_file/multi_file_states.hpp | 1 - .../include/duckdb/main/extension_entries.hpp | 3 +- .../duckdb/parser/peg/inlined_grammar.hpp | 2 +- .../peg/transformer/peg_transformer.hpp | 4 +-- .../transformer/peg_transformer_factory.cpp | 9 ++++-- .../peg/transformer/transform_expression.cpp | 8 ++--- .../peg/transformer/transform_generated.cpp | 6 ++-- .../parser/peg/transformer/transform_load.cpp | 2 +- 11 files changed, 45 insertions(+), 30 deletions(-) diff --git a/src/duckdb/extension/icu/icu-datefunc.cpp b/src/duckdb/extension/icu/icu-datefunc.cpp index 9e673d37d..8a6699dc4 100644 --- a/src/duckdb/extension/icu/icu-datefunc.cpp +++ b/src/duckdb/extension/icu/icu-datefunc.cpp @@ -176,7 +176,7 @@ uint64_t ICUDateFunc::SetTimeNS(icu::Calendar *calendar, timestamp_tz_ns_t date) int64_t nanos = date.value % Interval::NANOS_PER_MSEC; if (nanos < 0) { --millis; - nanos += Interval::MICROS_PER_MSEC; + nanos += Interval::NANOS_PER_MSEC; } const auto udate = UDate(millis); diff --git a/src/duckdb/extension/icu/icu-datesub.cpp b/src/duckdb/extension/icu/icu-datesub.cpp index 98f903359..df2b3cb02 100644 --- a/src/duckdb/extension/icu/icu-datesub.cpp +++ b/src/duckdb/extension/icu/icu-datesub.cpp @@ -187,20 +187,29 @@ ICUDateFunc::part_sub_t ICUDateFunc::SubtractFactory(DatePartSpecifier type) { // MS-SQL differences can be computed using ICU by truncating both arguments // to the desired part precision and then applying ICU subtraction/difference struct ICUCalendarDiff : public ICUDateFunc { - template - static int64_t DifferenceFunc(icu::Calendar *calendar, timestamp_tz_t start_date, 
timestamp_tz_t end_date, - part_trunc_t trunc_func, part_sub_t sub_func) { - // Truncate the two arguments. This is safe because we will stay in range - auto micros = SetTime(calendar, start_date); + static timestamp_tz_t TruncateForDiff(icu::Calendar *calendar, timestamp_tz_t date, part_trunc_t trunc_func) { + auto micros = SetTime(calendar, date); trunc_func(calendar, micros); - start_date = GetTimeUnsafe(calendar, micros); + return GetTimeUnsafe(calendar, micros); + } - micros = SetTime(calendar, end_date); + static timestamp_tz_t TruncateForDiff(icu::Calendar *calendar, timestamp_tz_ns_t date, part_trunc_t trunc_func) { + auto nanos = SetTimeNS(calendar, date); + // Adapt TIMESTAMPTZ_NS to the existing microsecond-or-coarser date_diff path. + uint64_t micros = nanos / Interval::NANOS_PER_MICRO; trunc_func(calendar, micros); - end_date = GetTimeUnsafe(calendar, micros); + return GetTimeUnsafe(calendar, micros); + } + + template + static int64_t DifferenceFunc(icu::Calendar *calendar, T start_date, T end_date, part_trunc_t trunc_func, + part_sub_t sub_func) { + // Truncate the two arguments. 
This is safe because we will stay in range + auto start_micros = TruncateForDiff(calendar, start_date, trunc_func); + auto end_micros = TruncateForDiff(calendar, end_date, trunc_func); // Now use ICU difference - return sub_func(calendar, start_date, end_date); + return sub_func(calendar, start_micros, end_micros); } static part_trunc_t DiffTruncationFactory(DatePartSpecifier type) { @@ -238,7 +247,7 @@ struct ICUCalendarDiff : public ICUDateFunc { BinaryExecutor::Execute( startdate_arg, enddate_arg, result, [&](T start_date, T end_date) -> optional { if (start_date.IsFinite() && end_date.IsFinite()) { - return DifferenceFunc(calendar, start_date, end_date, trunc_func, sub_func); + return DifferenceFunc(calendar, start_date, end_date, trunc_func, sub_func); } else { return nullopt; } @@ -252,7 +261,7 @@ struct ICUCalendarDiff : public ICUDateFunc { const auto part = GetDatePartSpecifier(specifier.GetString()); auto trunc_func = DiffTruncationFactory(part); auto sub_func = SubtractFactory(part); - return DifferenceFunc(calendar, start_date, end_date, trunc_func, sub_func); + return DifferenceFunc(calendar, start_date, end_date, trunc_func, sub_func); } else { return nullopt; } @@ -268,6 +277,7 @@ struct ICUCalendarDiff : public ICUDateFunc { static void AddFunctions(const Identifier &name, ExtensionLoader &loader) { ScalarFunctionSet set {name}; set.AddFunction(GetFunction(LogicalType::TIMESTAMP_TZ)); + set.AddFunction(GetFunction(LogicalType::TIMESTAMP_TZ_NS)); set.SetArgProperties(1, ArgProperties().NonIncreasing()); set.SetArgProperties(2, ArgProperties().NonDecreasing()); loader.RegisterFunction(set); diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index fec273829..27d0e236e 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION 
"0-dev10007" +#define DUCKDB_PATCH_VERSION "0-dev10027" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 6 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.6.0-dev10007" +#define DUCKDB_VERSION "v1.6.0-dev10027" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "2daa4fc9a4" +#define DUCKDB_SOURCE_ID "3cb65aa794" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/include/duckdb/common/multi_file/multi_file_states.hpp b/src/duckdb/src/include/duckdb/common/multi_file/multi_file_states.hpp index 45fd80712..52a6c2941 100644 --- a/src/duckdb/src/include/duckdb/common/multi_file/multi_file_states.hpp +++ b/src/duckdb/src/include/duckdb/common/multi_file/multi_file_states.hpp @@ -12,7 +12,6 @@ #include "duckdb/common/multi_file/multi_file_options.hpp" #include "duckdb/common/multi_file/base_file_reader.hpp" #include "duckdb/common/multi_file/multi_file_list.hpp" -#include "duckdb/common/windows_undefs.hpp" #include "duckdb/execution/expression_executor.hpp" namespace duckdb { diff --git a/src/duckdb/src/include/duckdb/main/extension_entries.hpp b/src/duckdb/src/include/duckdb/main/extension_entries.hpp index fb0f755c4..a6a6f516a 100644 --- a/src/duckdb/src/include/duckdb/main/extension_entries.hpp +++ b/src/duckdb/src/include/duckdb/main/extension_entries.hpp @@ -462,7 +462,6 @@ static constexpr ExtensionFunctionEntry EXTENSION_FUNCTIONS[] = { {"list_value", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY}, {"listagg", "core_functions", CatalogType::AGGREGATE_FUNCTION_ENTRY}, {"ln", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY}, - {"load_aws_credentials", "aws", CatalogType::TABLE_FUNCTION_ENTRY}, {"log", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY}, {"log10", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY}, {"log2", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY}, @@ -870,6 +869,7 @@ 
static constexpr ExtensionFunctionOverloadEntry EXTENSION_FUNCTION_OVERLOADS[] = {"date_diff", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY, "[VARCHAR,TIME,TIME]>BIGINT"}, {"date_diff", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY, "[VARCHAR,TIMESTAMP,TIMESTAMP]>BIGINT"}, {"date_diff", "icu", CatalogType::SCALAR_FUNCTION_ENTRY, "[VARCHAR,TIMESTAMPTZ,TIMESTAMPTZ]>BIGINT"}, + {"date_diff", "icu", CatalogType::SCALAR_FUNCTION_ENTRY, "[VARCHAR,TIMESTAMPTZ_NS,TIMESTAMPTZ_NS]>BIGINT"}, {"date_part", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY, "['VARCHAR[]',DATE]>STRUCT"}, {"date_part", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY, "['VARCHAR[]',INTERVAL]>STRUCT"}, {"date_part", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY, "['VARCHAR[]',TIME]>STRUCT"}, @@ -896,6 +896,7 @@ static constexpr ExtensionFunctionOverloadEntry EXTENSION_FUNCTION_OVERLOADS[] = {"datediff", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY, "[VARCHAR,TIME,TIME]>BIGINT"}, {"datediff", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY, "[VARCHAR,TIMESTAMP,TIMESTAMP]>BIGINT"}, {"datediff", "icu", CatalogType::SCALAR_FUNCTION_ENTRY, "[VARCHAR,TIMESTAMPTZ,TIMESTAMPTZ]>BIGINT"}, + {"datediff", "icu", CatalogType::SCALAR_FUNCTION_ENTRY, "[VARCHAR,TIMESTAMPTZ_NS,TIMESTAMPTZ_NS]>BIGINT"}, {"datepart", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY, "['VARCHAR[]',DATE]>STRUCT"}, {"datepart", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY, "['VARCHAR[]',INTERVAL]>STRUCT"}, {"datepart", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY, "['VARCHAR[]',TIME]>STRUCT"}, diff --git a/src/duckdb/src/include/duckdb/parser/peg/inlined_grammar.hpp b/src/duckdb/src/include/duckdb/parser/peg/inlined_grammar.hpp index e7ca08f50..368f7621b 100644 --- a/src/duckdb/src/include/duckdb/parser/peg/inlined_grammar.hpp +++ b/src/duckdb/src/include/duckdb/parser/peg/inlined_grammar.hpp @@ -1008,7 +1008,7 @@ const char INLINED_PEG_GRAMMAR[] = { "NullIfExpression <- 
'NULLIF' Parens(NullIfArguments)\n" "NullIfArguments <- Expression ',' Expression\n" "PositionExpression <- 'POSITION' Parens(PositionArguments)\n" - "PositionArguments <- SingleExpression 'IN' SingleExpression\n" + "PositionArguments <- OtherOperatorExpression 'IN' Expression\n" "RowExpression <- 'ROW' Parens(List(Expression)?)\n" "SubstringExpression <- 'SUBSTRING' Parens(SubstringArguments)\n" "SubstringArguments <- SubstringParameters / SubstringExpressionList\n" diff --git a/src/duckdb/src/include/duckdb/parser/peg/transformer/peg_transformer.hpp b/src/duckdb/src/include/duckdb/parser/peg/transformer/peg_transformer.hpp index 1e37aade9..e331b2e45 100644 --- a/src/duckdb/src/include/duckdb/parser/peg/transformer/peg_transformer.hpp +++ b/src/duckdb/src/include/duckdb/parser/peg/transformer/peg_transformer.hpp @@ -2802,8 +2802,8 @@ class PEGTransformerFactory { static unique_ptr TransformPositionArgumentsInternal(PEGTransformer &transformer, ParseResult &parse_result); static vector> - TransformPositionArguments(PEGTransformer &transformer, unique_ptr single_expression, - unique_ptr single_expression_1); + TransformPositionArguments(PEGTransformer &transformer, unique_ptr other_operator_expression, + unique_ptr expression); static unique_ptr TransformRowExpressionInternal(PEGTransformer &transformer, ParseResult &parse_result); static unique_ptr diff --git a/src/duckdb/src/parser/peg/transformer/peg_transformer_factory.cpp b/src/duckdb/src/parser/peg/transformer/peg_transformer_factory.cpp index 48785361d..73e622bad 100644 --- a/src/duckdb/src/parser/peg/transformer/peg_transformer_factory.cpp +++ b/src/duckdb/src/parser/peg/transformer/peg_transformer_factory.cpp @@ -371,10 +371,15 @@ bool PEGTransformerFactory::ConstructConstantFromExpression(const ParsedExpressi return false; } + auto cast_type = UnboundType::TryDefaultBind(cast.TargetType()); + if (cast_type == LogicalType::INVALID || cast_type == LogicalTypeId::UNBOUND) { + return false; + } + string 
error_message; - if (!dummy_value.DefaultTryCastAs(cast.TargetType(), value, &error_message)) { + if (!dummy_value.DefaultTryCastAs(cast_type, value, &error_message)) { throw ConversionException("Unable to cast %s to %s", dummy_value.ToString(), - EnumUtil::ToString(cast.TargetType().id())); + EnumUtil::ToString(cast_type.id())); } return true; } diff --git a/src/duckdb/src/parser/peg/transformer/transform_expression.cpp b/src/duckdb/src/parser/peg/transformer/transform_expression.cpp index c32cd1ea8..814bee59e 100644 --- a/src/duckdb/src/parser/peg/transformer/transform_expression.cpp +++ b/src/duckdb/src/parser/peg/transformer/transform_expression.cpp @@ -2405,11 +2405,11 @@ PEGTransformerFactory::TransformPositionExpression(PEGTransformer &transformer, vector> PEGTransformerFactory::TransformPositionArguments(PEGTransformer &transformer, - unique_ptr single_expression, - unique_ptr single_expression_1) { + unique_ptr other_operator_expression, + unique_ptr expression) { vector> result; - result.push_back(std::move(single_expression_1)); - result.push_back(std::move(single_expression)); + result.push_back(std::move(expression)); + result.push_back(std::move(other_operator_expression)); return result; } diff --git a/src/duckdb/src/parser/peg/transformer/transform_generated.cpp b/src/duckdb/src/parser/peg/transformer/transform_generated.cpp index 95b7610a3..08084d1c0 100644 --- a/src/duckdb/src/parser/peg/transformer/transform_generated.cpp +++ b/src/duckdb/src/parser/peg/transformer/transform_generated.cpp @@ -6675,9 +6675,9 @@ unique_ptr PEGTransformerFactory::TransformPositionExpress unique_ptr PEGTransformerFactory::TransformPositionArgumentsInternal(PEGTransformer &transformer, ParseResult &parse_result) { auto &list_pr = parse_result.Cast(); - auto single_expression = transformer.Transform>(list_pr.GetChild(0)); - auto single_expression_1 = transformer.Transform>(list_pr.GetChild(2)); - auto result = TransformPositionArguments(transformer, 
std::move(single_expression), std::move(single_expression_1)); + auto other_operator_expression = transformer.Transform>(list_pr.GetChild(0)); + auto expression = transformer.Transform>(list_pr.GetChild(2)); + auto result = TransformPositionArguments(transformer, std::move(other_operator_expression), std::move(expression)); return make_uniq>>>(std::move(result)); } diff --git a/src/duckdb/src/parser/peg/transformer/transform_load.cpp b/src/duckdb/src/parser/peg/transformer/transform_load.cpp index 6fce3548f..cdbbc06cf 100644 --- a/src/duckdb/src/parser/peg/transformer/transform_load.cpp +++ b/src/duckdb/src/parser/peg/transformer/transform_load.cpp @@ -32,7 +32,7 @@ unique_ptr PEGTransformerFactory::TransformInstallStatement( const optional &from_source, const optional &version_number) { auto result = make_uniq(); auto info = make_uniq(); - info->load_type = LoadType::INSTALL; + info->load_type = has_result ? LoadType::FORCE_INSTALL : LoadType::INSTALL; info->filename = identifier_or_string_literal.Name().GetIdentifierName(); info->repo_is_alias = false; if (from_source) { From 70aa834e946b7daf85d4a709d0e89bfa1dae4553 Mon Sep 17 00:00:00 2001 From: DuckDB Labs GitHub Bot Date: Thu, 2 Jul 2026 14:06:05 +0000 Subject: [PATCH 2/2] Update vendored DuckDB sources to 1da5b6c8cf --- .../src/common/row_operations/row_matcher.cpp | 18 +- .../common/types/row/tuple_data_layout.cpp | 10 +- .../comparison_operators.cpp | 239 +++++++-- .../vector_operations/is_distinct_from.cpp | 16 - .../src/execution/aggregate_hashtable.cpp | 5 +- .../scalar/geometry/geometry_functions.cpp | 54 ++ .../function/table/version/pragma_version.cpp | 9 +- .../common/multi_file/multi_file_states.hpp | 1 + .../common/types/row/tuple_data_layout.hpp | 3 +- .../vector_operations/vector_operations.hpp | 13 - .../duckdb/function/scalar_function.hpp | 16 +- .../include/duckdb/main/client_context.hpp | 35 +- .../include/duckdb/main/extension_entries.hpp | 5 + .../include/duckdb/main/parse_iterator.hpp | 94 
++++ .../duckdb/main/statement_iterator.hpp | 79 +++ .../optimizer/outer_join_simplification.hpp | 55 +- .../src/include/duckdb/parser/parser.hpp | 12 + .../duckdb/parser/peg/inlined_grammar.hpp | 7 +- .../parser/peg/tokenizer/parser_tokenizer.hpp | 1 + .../peg/transformer/peg_transformer.hpp | 14 + .../storage/statistics/geometry_stats.hpp | 4 - src/duckdb/src/main/client_context.cpp | 237 +++++--- src/duckdb/src/main/connection.cpp | 12 +- src/duckdb/src/main/parse_iterator.cpp | 184 +++++++ src/duckdb/src/main/statement_iterator.cpp | 67 +++ .../optimizer/outer_join_simplification.cpp | 505 ++++++++++++++---- .../src/optimizer/pushdown/pushdown_get.cpp | 34 +- .../optimizer/rule/contains_to_in_clause.cpp | 39 +- .../statistics/expression/propagate_cast.cpp | 5 + src/duckdb/src/optimizer/type_pushdown.cpp | 7 +- src/duckdb/src/parser/parser.cpp | 113 ++-- .../parser/peg/tokenizer/parser_tokenizer.cpp | 11 + .../peg/transformer/transform_generated.cpp | 37 ++ .../peg/transformer/transform_select.cpp | 19 + .../expression/bind_unnest_expression.cpp | 2 +- .../src/planner/filter/expression_filter.cpp | 24 +- .../filter/table_filter_bloom_function.cpp | 25 +- .../filter/table_filter_dynamic_function.cpp | 6 +- .../filter/table_filter_optional_function.cpp | 6 +- .../table_filter_prefix_range_function.cpp | 19 +- ...e_filter_selectivity_optional_function.cpp | 6 +- .../src/storage/statistics/geometry_stats.cpp | 113 ---- src/duckdb/ub_src_main.cpp | 4 + 43 files changed, 1669 insertions(+), 496 deletions(-) create mode 100644 src/duckdb/src/include/duckdb/main/parse_iterator.hpp create mode 100644 src/duckdb/src/include/duckdb/main/statement_iterator.hpp create mode 100644 src/duckdb/src/main/parse_iterator.cpp create mode 100644 src/duckdb/src/main/statement_iterator.cpp diff --git a/src/duckdb/src/common/row_operations/row_matcher.cpp b/src/duckdb/src/common/row_operations/row_matcher.cpp index 6869fe97e..06b8e67f4 100644 --- 
a/src/duckdb/src/common/row_operations/row_matcher.cpp +++ b/src/duckdb/src/common/row_operations/row_matcher.cpp @@ -1,5 +1,4 @@ #include "duckdb/common/vector/flat_vector.hpp" -#include "duckdb/common/vector/map_vector.hpp" #include "duckdb/common/vector/struct_vector.hpp" #include "duckdb/common/row_operations/row_matcher.hpp" @@ -164,13 +163,13 @@ static idx_t SelectComparison(const Vector &, const Vector &, const SelectionVec template <> idx_t SelectComparison(const Vector &left, const Vector &right, const SelectionVector &sel, idx_t count, SelectionVector *true_sel, SelectionVector *false_sel) { - return VectorOperations::NestedEquals(left, right, &sel, count, true_sel, false_sel); + return VectorOperations::Equals(left, right, &sel, count, true_sel, false_sel); } template <> idx_t SelectComparison(const Vector &left, const Vector &right, const SelectionVector &sel, idx_t count, SelectionVector *true_sel, SelectionVector *false_sel) { - return VectorOperations::NestedNotEquals(left, right, &sel, count, true_sel, false_sel); + return VectorOperations::NotEquals(left, right, &sel, count, true_sel, false_sel); } template <> @@ -381,8 +380,11 @@ MatchFunction RowMatcher::GetStructMatchFunction(const LogicalType &type, const ExpressionType child_predicate = predicate; switch (predicate) { case ExpressionType::COMPARE_EQUAL: - result.function = StructMatchEquality; - child_predicate = ExpressionType::COMPARE_NOT_DISTINCT_FROM; + if (type.id() == LogicalTypeId::UNION) { + result.function = GenericNestedMatch; + } else { + result.function = StructMatchEquality; + } break; case ExpressionType::COMPARE_NOTEQUAL: result.function = GenericNestedMatch; @@ -391,7 +393,11 @@ MatchFunction RowMatcher::GetStructMatchFunction(const LogicalType &type, const result.function = GenericNestedMatch; return result; case ExpressionType::COMPARE_NOT_DISTINCT_FROM: - result.function = StructMatchEquality; + if (type.id() == LogicalTypeId::UNION) { + result.function = GenericNestedMatch; 
+ } else { + result.function = StructMatchEquality; + } break; case ExpressionType::COMPARE_GREATERTHAN: result.function = GenericNestedMatch; diff --git a/src/duckdb/src/common/types/row/tuple_data_layout.cpp b/src/duckdb/src/common/types/row/tuple_data_layout.cpp index 3423a6aed..6adf9809f 100644 --- a/src/duckdb/src/common/types/row/tuple_data_layout.cpp +++ b/src/duckdb/src/common/types/row/tuple_data_layout.cpp @@ -1,6 +1,5 @@ #include "duckdb/common/types/row/tuple_data_layout.hpp" -#include "duckdb/planner/expression/bound_aggregate_expression.hpp" #include "duckdb/common/sorting/sort_key.hpp" namespace duckdb { @@ -228,4 +227,13 @@ bool TupleDataLayout::IsSortKeyLayout() const { return sort_key_type != SortKeyType::INVALID; } +bool TupleDataLayout::HasNestedTypes() const { + for (const auto &type : types) { + if (type.IsNested()) { + return true; + } + } + return false; +} + } // namespace duckdb diff --git a/src/duckdb/src/common/vector_operations/comparison_operators.cpp b/src/duckdb/src/common/vector_operations/comparison_operators.cpp index db33f0e73..8f7f47dd9 100644 --- a/src/duckdb/src/common/vector_operations/comparison_operators.cpp +++ b/src/duckdb/src/common/vector_operations/comparison_operators.cpp @@ -8,13 +8,11 @@ #include "duckdb/common/uhugeint.hpp" #include "duckdb/common/types/variant.hpp" -#include "duckdb/common/value_operations/value_operations.hpp" #include "duckdb/common/vector/array_vector.hpp" #include "duckdb/common/vector/flat_vector.hpp" #include "duckdb/common/vector/list_vector.hpp" #include "duckdb/common/vector/struct_vector.hpp" #include "duckdb/common/vector/vector_iterator.hpp" -#include "duckdb/function/scalar/variant_utils.hpp" #include "duckdb/common/vector_operations/binary_executor.hpp" #include "duckdb/common/vector_operations/vector_operations.hpp" @@ -253,8 +251,25 @@ void VectorOperations::LessThanEquals(const Vector &left, const Vector &right, V struct StandardComparatorExecute { template - static inline void 
Execute(const Vector &left, const Vector &right, Vector &result, idx_t count) { - BinaryExecutor::Execute(left, right, result, count); + static inline void Execute(const Vector &left, const Vector &right, int8_t *result_data, + const SelectionVector &lhs_sel, const SelectionVector &rhs_sel, idx_t sel_count, + ValidityMask &result_validity) { + UnifiedVectorFormat left_format, right_format; + left.ToUnifiedFormat(left_format); + right.ToUnifiedFormat(right_format); + auto ldata = UnifiedVectorFormat::GetData(left_format); + auto rdata = UnifiedVectorFormat::GetData(right_format); + for (idx_t i = 0; i < sel_count; i++) { + auto lidx = left_format.sel->get_index(lhs_sel.get_index(i)); + auto ridx = right_format.sel->get_index(rhs_sel.get_index(i)); + bool left_null = !left_format.validity.RowIsValid(lidx); + bool right_null = !right_format.validity.RowIsValid(ridx); + if (left_null || right_null) { + result_validity.SetInvalid(i); + } else { + result_data[i] = duckdb::Comparator::Operation(ldata[lidx], rdata[ridx]); + } + } } }; @@ -293,6 +308,10 @@ static int8_t DistinctNullComparator(bool left_null, bool right_null) { return Comparator::RIGHT_IS_GREATER; } +static void ComparatorTypeSwitch(const Vector &left, const Vector &right, int8_t *result_data, + const SelectionVector &lhs_sel, const SelectionVector &rhs_sel, idx_t sel_count, + ValidityMask &validity); + static void StructComparator(const Vector &left, const Vector &right, int8_t *result_data, const SelectionVector &lhs_sel, const SelectionVector &rhs_sel, idx_t sel_count, optional_ptr result_validity = nullptr) { @@ -348,20 +367,59 @@ static void StructComparator(const Vector &left, const Vector &right, int8_t *re // step 2: compare child vectors one by one // child results are written densely, then scattered back to the correct output positions auto child_result = make_unsafe_uniq_array(remaining_count); + ValidityMask child_validity(remaining_count); + const bool is_union = (left.GetType().id() == 
LogicalTypeId::UNION); for (idx_t child_idx = 0; child_idx < lchildren.size() && remaining_count > 0; child_idx++) { - DistinctComparatorTypeSwitch(lchildren[child_idx], rchildren[child_idx], child_result.get(), remaining_lhs_sel, - remaining_rhs_sel, remaining_count); - idx_t new_remaining_count = 0; + if (!result_validity) { + // DISTINCT + DistinctComparatorTypeSwitch(lchildren[child_idx], rchildren[child_idx], child_result.get(), + remaining_lhs_sel, remaining_rhs_sel, remaining_count); + } else { + // regular comparison - set NULL if any value is NULL + child_validity.SetAllValid(remaining_count); + ComparatorTypeSwitch(lchildren[child_idx], rchildren[child_idx], child_result.get(), remaining_lhs_sel, + remaining_rhs_sel, remaining_count, child_validity); + } + + if (is_union && child_idx && result_validity) { + // For SQL-equality comparisons of UNION types, + // we don't know if the NULL means NULL or a different column + // So we have to further restrict the comparison by column index + auto &key = lchildren[0]; + UnifiedVectorFormat key_format; + key.ToUnifiedFormat(key_format); + const uint8_t *key_data = UnifiedVectorFormat::GetData(key_format); + for (idx_t i = 0; i < remaining_count; i++) { + const auto remaining_idx = remaining_result_sel.get_index(i); + const idx_t key_idx = key_format.sel->get_index(remaining_lhs_sel.get_index(i)); + // Skip if not the current column + if (key_data[key_idx] != child_idx - 1) { + remaining_lhs_sel.set_index(new_remaining_count, remaining_lhs_sel.get_index(i)); + remaining_rhs_sel.set_index(new_remaining_count, remaining_rhs_sel.get_index(i)); + remaining_result_sel.set_index(new_remaining_count, remaining_idx); + new_remaining_count++; + } else if (!child_validity.RowIsValidUnsafe(i)) { + result_validity->SetInvalid(remaining_idx); + } else { + result_data[remaining_idx] = child_result[i]; + } + } + continue; + } + for (idx_t i = 0; i < remaining_count; i++) { - if (child_result[i] != Comparator::VALUES_ARE_EQUAL) { 
+ const auto remaining_idx = remaining_result_sel.get_index(i); + if (result_validity && !child_validity.RowIsValidUnsafe(i)) { + result_validity->SetInvalid(remaining_idx); + } else if (child_result[i] != Comparator::VALUES_ARE_EQUAL) { // not equal at this position - we found the final result for this row - result_data[remaining_result_sel.get_index(i)] = child_result[i]; + result_data[remaining_idx] = child_result[i]; } else { // still equal at this position - need to check the next entry remaining_lhs_sel.set_index(new_remaining_count, remaining_lhs_sel.get_index(i)); remaining_rhs_sel.set_index(new_remaining_count, remaining_rhs_sel.get_index(i)); - remaining_result_sel.set_index(new_remaining_count, remaining_result_sel.get_index(i)); + remaining_result_sel.set_index(new_remaining_count, remaining_idx); new_remaining_count++; } } @@ -470,6 +528,7 @@ static void ListOrArrayComparator(const Vector &left, const Vector &right, int8_ SelectionVector left_child_sel(remaining_count); SelectionVector right_child_sel(remaining_count); auto child_result = make_unsafe_uniq_array(remaining_count); + ValidityMask child_validity(remaining_count); for (idx_t index_in_list = 0; remaining_count > 0; index_in_list++) { // partition remaining into: exhausted (one or both ended) vs active (both have element at pos) @@ -503,18 +562,29 @@ static void ListOrArrayComparator(const Vector &left, const Vector &right, int8_ } // compare child elements at this position - DistinctComparatorTypeSwitch(left_child, right_child, child_result.get(), left_child_sel, right_child_sel, - active_count); + if (!result_validity) { + // DISTINCT + DistinctComparatorTypeSwitch(left_child, right_child, child_result.get(), left_child_sel, right_child_sel, + active_count); + } else { + // regular comparison - set NULL if any value is NULL + child_validity.SetAllValid(remaining_count); + ComparatorTypeSwitch(left_child, right_child, child_result.get(), left_child_sel, right_child_sel, + active_count, 
child_validity); + } // partition active into resolved vs still-remaining idx_t new_remaining_count = 0; for (idx_t i = 0; i < active_count; i++) { - if (child_result[i] != 0) { - result_data[remaining_result_sel.get_index(i)] = child_result[i]; + const auto remaining_idx = remaining_result_sel.get_index(i); + if (result_validity && !child_validity.RowIsValidUnsafe(i)) { + result_validity->SetInvalid(remaining_idx); + } else if (child_result[i] != Comparator::VALUES_ARE_EQUAL) { + result_data[remaining_idx] = child_result[i]; } else { remaining_lhs_sel.set_index(new_remaining_count, remaining_lhs_sel.get_index(i)); remaining_rhs_sel.set_index(new_remaining_count, remaining_rhs_sel.get_index(i)); - remaining_result_sel.set_index(new_remaining_count, remaining_result_sel.get_index(i)); + remaining_result_sel.set_index(new_remaining_count, remaining_idx); new_remaining_count++; } } @@ -536,9 +606,9 @@ static void ArrayComparator(const Vector &left, const Vector &right, int8_t *res ListOrArrayComparator(left, right, result_data, lhs_sel, rhs_sel, sel_count, accessor, result_validity); } -static void DistinctComparatorTypeSwitchInternal(const Vector &left, const Vector &right, int8_t *result_data, - const SelectionVector &lhs_sel, const SelectionVector &rhs_sel, - idx_t sel_count) { +static void DistinctComparatorTypeSwitch(const Vector &left, const Vector &right, int8_t *result_data, + const SelectionVector &lhs_sel, const SelectionVector &rhs_sel, + idx_t sel_count) { D_ASSERT(left.GetType().InternalType() == right.GetType().InternalType()); switch (left.GetType().InternalType()) { case PhysicalType::BOOL: @@ -598,83 +668,138 @@ static void DistinctComparatorTypeSwitchInternal(const Vector &left, const Vecto } } -static void DistinctComparatorTypeSwitch(const Vector &left, const Vector &right, int8_t *result_data, - const SelectionVector &lhs_sel, const SelectionVector &rhs_sel, - idx_t sel_count) { - DistinctComparatorTypeSwitchInternal(left, right, result_data, 
lhs_sel, rhs_sel, sel_count); -} - -static void ComparatorTypeSwitch(const Vector &left, const Vector &right, Vector &result, idx_t count) { - D_ASSERT(left.GetType().InternalType() == right.GetType().InternalType() && - result.GetType() == LogicalType::TINYINT); +static void ComparatorTypeSwitch(const Vector &left, const Vector &right, int8_t *result_data, + const SelectionVector &lhs_sel, const SelectionVector &rhs_sel, idx_t sel_count, + ValidityMask &validity) { + D_ASSERT(left.GetType().InternalType() == right.GetType().InternalType()); switch (left.GetType().InternalType()) { case PhysicalType::BOOL: case PhysicalType::INT8: - StandardComparatorExecute::Execute(left, right, result, count); + StandardComparatorExecute::Execute(left, right, result_data, lhs_sel, rhs_sel, sel_count, validity); break; case PhysicalType::INT16: - StandardComparatorExecute::Execute(left, right, result, count); + StandardComparatorExecute::Execute(left, right, result_data, lhs_sel, rhs_sel, sel_count, validity); break; case PhysicalType::INT32: - StandardComparatorExecute::Execute(left, right, result, count); + StandardComparatorExecute::Execute(left, right, result_data, lhs_sel, rhs_sel, sel_count, validity); break; case PhysicalType::INT64: - StandardComparatorExecute::Execute(left, right, result, count); + StandardComparatorExecute::Execute(left, right, result_data, lhs_sel, rhs_sel, sel_count, validity); break; case PhysicalType::UINT8: - StandardComparatorExecute::Execute(left, right, result, count); + StandardComparatorExecute::Execute(left, right, result_data, lhs_sel, rhs_sel, sel_count, validity); break; case PhysicalType::UINT16: - StandardComparatorExecute::Execute(left, right, result, count); + StandardComparatorExecute::Execute(left, right, result_data, lhs_sel, rhs_sel, sel_count, validity); break; case PhysicalType::UINT32: - StandardComparatorExecute::Execute(left, right, result, count); + StandardComparatorExecute::Execute(left, right, result_data, lhs_sel, rhs_sel, 
sel_count, validity); break; case PhysicalType::UINT64: - StandardComparatorExecute::Execute(left, right, result, count); + StandardComparatorExecute::Execute(left, right, result_data, lhs_sel, rhs_sel, sel_count, validity); break; case PhysicalType::INT128: - StandardComparatorExecute::Execute(left, right, result, count); + StandardComparatorExecute::Execute(left, right, result_data, lhs_sel, rhs_sel, sel_count, validity); break; case PhysicalType::UINT128: - StandardComparatorExecute::Execute(left, right, result, count); + StandardComparatorExecute::Execute(left, right, result_data, lhs_sel, rhs_sel, sel_count, validity); break; case PhysicalType::FLOAT: - StandardComparatorExecute::Execute(left, right, result, count); + StandardComparatorExecute::Execute(left, right, result_data, lhs_sel, rhs_sel, sel_count, validity); break; case PhysicalType::DOUBLE: - StandardComparatorExecute::Execute(left, right, result, count); + StandardComparatorExecute::Execute(left, right, result_data, lhs_sel, rhs_sel, sel_count, validity); break; case PhysicalType::INTERVAL: - StandardComparatorExecute::Execute(left, right, result, count); + StandardComparatorExecute::Execute(left, right, result_data, lhs_sel, rhs_sel, sel_count, validity); break; case PhysicalType::VARCHAR: - StandardComparatorExecute::Execute(left, right, result, count); + StandardComparatorExecute::Execute(left, right, result_data, lhs_sel, rhs_sel, sel_count, validity); break; case PhysicalType::STRUCT: + StructComparator(left, right, result_data, lhs_sel, rhs_sel, sel_count, validity); + break; case PhysicalType::LIST: - case PhysicalType::ARRAY: { - result.SetVectorType(VectorType::FLAT_VECTOR); - auto result_data = FlatVector::GetDataMutable(result); - auto &validity = FlatVector::ValidityMutable(result); - auto &sel = *FlatVector::IncrementalSelectionVector(); - auto physical_type = left.GetType().InternalType(); - if (physical_type == PhysicalType::STRUCT) { - StructComparator(left, right, result_data, sel, 
sel, count, validity); - } else if (physical_type == PhysicalType::LIST) { - ListComparator(left, right, result_data, sel, sel, count, validity); - } else { - ArrayComparator(left, right, result_data, sel, sel, count, validity); - } + ListComparator(left, right, result_data, lhs_sel, rhs_sel, sel_count, validity); + break; + case PhysicalType::ARRAY: + ArrayComparator(left, right, result_data, lhs_sel, rhs_sel, sel_count, validity); break; - } default: throw InternalException("Invalid type for comparator"); } } +template +static void ComparatorExecute(const Vector &left, const Vector &right, Vector &result, idx_t count) { + BinaryExecutor::Execute(left, right, result, count); +} + +template +static bool TryPrimitiveComparatorExecute(const Vector &left, const Vector &right, Vector &result, idx_t count) { +#ifdef DUCKDB_SMALLER_BINARY + return false; +#else + D_ASSERT(left.GetType().InternalType() == right.GetType().InternalType()); + switch (left.GetType().InternalType()) { + case PhysicalType::BOOL: + case PhysicalType::INT8: + ComparatorExecute(left, right, result, count); + return true; + case PhysicalType::INT16: + ComparatorExecute(left, right, result, count); + return true; + case PhysicalType::INT32: + ComparatorExecute(left, right, result, count); + return true; + case PhysicalType::INT64: + ComparatorExecute(left, right, result, count); + return true; + case PhysicalType::UINT8: + ComparatorExecute(left, right, result, count); + return true; + case PhysicalType::UINT16: + ComparatorExecute(left, right, result, count); + return true; + case PhysicalType::UINT32: + ComparatorExecute(left, right, result, count); + return true; + case PhysicalType::UINT64: + ComparatorExecute(left, right, result, count); + return true; + case PhysicalType::INT128: + ComparatorExecute(left, right, result, count); + return true; + case PhysicalType::UINT128: + ComparatorExecute(left, right, result, count); + return true; + case PhysicalType::FLOAT: + ComparatorExecute(left, 
right, result, count); + return true; + case PhysicalType::DOUBLE: + ComparatorExecute(left, right, result, count); + return true; + case PhysicalType::INTERVAL: + ComparatorExecute(left, right, result, count); + return true; + case PhysicalType::VARCHAR: + ComparatorExecute(left, right, result, count); + return true; + default: + return false; + } +#endif +} + void VectorOperations::ComparatorFill(const Vector &left, const Vector &right, Vector &result, idx_t count) { - ComparatorTypeSwitch(left, right, result, count); + D_ASSERT(result.GetType() == LogicalType::TINYINT); + if (!TryPrimitiveComparatorExecute(left, right, result, count)) { + result.SetVectorType(VectorType::FLAT_VECTOR); + auto result_data = FlatVector::GetDataMutable(result); + auto &sel = *FlatVector::IncrementalSelectionVector(); + auto &validity = FlatVector::ValidityMutable(result); + ComparatorTypeSwitch(left, right, result_data, sel, sel, count, validity); + } FlatVector::SetSize(result, count); } @@ -785,7 +910,7 @@ void VectorOperations::DistinctComparatorFill(const Vector &left, const Vector & result.SetVectorType(VectorType::FLAT_VECTOR); auto result_data = FlatVector::GetDataMutable(result); auto &sel = *FlatVector::IncrementalSelectionVector(); - DistinctComparatorTypeSwitchInternal(left, right, result_data, sel, sel, count); + DistinctComparatorTypeSwitch(left, right, result_data, sel, sel, count); } FlatVector::SetSize(result, count); } diff --git a/src/duckdb/src/common/vector_operations/is_distinct_from.cpp b/src/duckdb/src/common/vector_operations/is_distinct_from.cpp index 33e010f72..f2aa911c1 100644 --- a/src/duckdb/src/common/vector_operations/is_distinct_from.cpp +++ b/src/duckdb/src/common/vector_operations/is_distinct_from.cpp @@ -132,20 +132,4 @@ idx_t VectorOperations::DistinctLessThanEquals(const Vector &left, const Vector VectorOperations::DistinctComparatorFill, [](int8_t v) { return v <= 0; }); } -// true := A != B with nulls being equal, inputs selected -idx_t 
VectorOperations::NestedNotEquals(const Vector &left, const Vector &right, - optional_ptr sel, idx_t count, - optional_ptr true_sel, optional_ptr false_sel, - optional_ptr null_mask) { - return DistinctComparatorSelect(left, right, sel, count, true_sel, false_sel, - VectorOperations::DistinctComparatorFill, [](int8_t v) { return v != 0; }); -} -// true := A == B with nulls being equal, inputs selected -idx_t VectorOperations::NestedEquals(const Vector &left, const Vector &right, optional_ptr sel, - idx_t count, optional_ptr true_sel, - optional_ptr false_sel, optional_ptr null_mask) { - return DistinctComparatorSelect(left, right, sel, count, true_sel, false_sel, - VectorOperations::DistinctComparatorFill, [](int8_t v) { return v == 0; }); -} - } // namespace duckdb diff --git a/src/duckdb/src/execution/aggregate_hashtable.cpp b/src/duckdb/src/execution/aggregate_hashtable.cpp index b90869402..46d20bea0 100644 --- a/src/duckdb/src/execution/aggregate_hashtable.cpp +++ b/src/duckdb/src/execution/aggregate_hashtable.cpp @@ -70,8 +70,9 @@ GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context_p, A Resize(initial_capacity); // Predicates - const auto expr_type = - layout_ptr->CannotHaveNull() ? ExpressionType::COMPARE_EQUAL : ExpressionType::COMPARE_NOT_DISTINCT_FROM; + const auto expr_type = (layout_ptr->CannotHaveNull() && !layout_ptr->HasNestedTypes()) + ? 
ExpressionType::COMPARE_EQUAL + : ExpressionType::COMPARE_NOT_DISTINCT_FROM; predicates.resize(layout_ptr->ColumnCount() - 1, expr_type); row_matcher.Initialize(true, *layout_ptr, predicates); diff --git a/src/duckdb/src/function/scalar/geometry/geometry_functions.cpp b/src/duckdb/src/function/scalar/geometry/geometry_functions.cpp index 990229bd9..0de2f49f5 100644 --- a/src/duckdb/src/function/scalar/geometry/geometry_functions.cpp +++ b/src/duckdb/src/function/scalar/geometry/geometry_functions.cpp @@ -4,6 +4,8 @@ #include "duckdb/common/vector_operations/binary_executor.hpp" #include "duckdb/execution/expression_executor.hpp" #include "duckdb/planner/expression/bound_constant_expression.hpp" +#include "duckdb/planner/expression/bound_function_expression.hpp" +#include "duckdb/storage/statistics/base_statistics.hpp" #include "duckdb/storage/statistics/geometry_stats.hpp" #include "duckdb/storage/statistics/string_stats.hpp" @@ -136,9 +138,61 @@ static void IntersectsExtentFunction(DataChunk &input, ExpressionState &state, V }); } +// Prune row groups for `geom1 && geom2` (a.k.a. ST_Intersects_Extent), which is true iff the two bounding +// boxes intersect. One argument must be a constant geometry; the other is the column we prune against. Both +// operands' statistics are derived for us (the constant's geometry stats already carry its bounding box, and +// the column's look through any CRS-only cast), so this works regardless of which side the constant is on. +static FilterPropagateResult IntersectsExtentFilterPrune(const FunctionStatisticsPruneInput &input) { + auto &children = input.function.GetChildren(); + if (children.size() != 2) { + return FilterPropagateResult::NO_PRUNING_POSSIBLE; + } + + // Constants are folded by the time we get here, so the constant operand is a plain BoundConstantExpression. 
+ const auto lhs_is_const = children[0]->GetExpressionType() == ExpressionType::VALUE_CONSTANT; + const auto rhs_is_const = children[1]->GetExpressionType() == ExpressionType::VALUE_CONSTANT; + if (lhs_is_const == rhs_is_const) { + // Need exactly one constant operand and one column operand. + return FilterPropagateResult::NO_PRUNING_POSSIBLE; + } + const idx_t constant_idx = lhs_is_const ? 0 : 1; + + auto column_stats = input.ChildStats(1 - constant_idx); + auto constant_stats = input.ChildStats(constant_idx); + if (!column_stats || column_stats->GetStatsType() != StatisticsType::GEOMETRY_STATS || !constant_stats || + constant_stats->GetStatsType() != StatisticsType::GEOMETRY_STATS) { + return FilterPropagateResult::NO_PRUNING_POSSIBLE; + } + if (!column_stats->CanHaveNoNull()) { + // no non-null values are possible: always false + return FilterPropagateResult::FILTER_ALWAYS_FALSE; + } + const auto &col_extent = GeometryStats::GetExtent(*column_stats); + if (!col_extent.CanPruneXY()) { + // If neither axis is set (the extent is empty or fully unknown), we cannot prune. + return FilterPropagateResult::NO_PRUNING_POSSIBLE; + } + + const auto &const_extent = GeometryStats::GetExtent(*constant_stats); + if (!const_extent.CanPruneXY()) { + // An empty constant geometry never intersects anything. + return FilterPropagateResult::FILTER_ALWAYS_FALSE; + } + if (!const_extent.IntersectsXY(col_extent)) { + // The column zonemap does not intersect the constant: no row can match. + return FilterPropagateResult::FILTER_ALWAYS_FALSE; + } + if (const_extent.ContainsXY(col_extent)) { + // The constant fully covers the column zonemap, so every non-null row's bbox intersects it. 
+ return FilterPropagateResult::FILTER_ALWAYS_TRUE; + } + return FilterPropagateResult::NO_PRUNING_POSSIBLE; +} + ScalarFunction StIntersectsExtentFun::GetFunction() { ScalarFunction function({LogicalType::GEOMETRY(), LogicalType::GEOMETRY()}, LogicalType::BOOLEAN, IntersectsExtentFunction); + function.SetFilterPruneCallback(IntersectsExtentFilterPrune); return function; } diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index 27d0e236e..d42a2354d 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "0-dev10027" +#define DUCKDB_PATCH_VERSION "0-dev10089" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 6 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.6.0-dev10027" +#define DUCKDB_VERSION "v1.6.0-dev10089" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "3cb65aa794" +#define DUCKDB_SOURCE_ID "1da5b6c8cf" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" @@ -93,6 +93,9 @@ const char *DuckDB::ReleaseCodename() { if (StringUtil::StartsWith(DUCKDB_VERSION, "v1.5.")) { return "Variegata"; } + if (StringUtil::StartsWith(DUCKDB_VERSION, "v2.0.")) { + return "Cyanoptera"; + } // add new version names here // we should not get here, but let's not fail because of it because tags on forks can be whatever diff --git a/src/duckdb/src/include/duckdb/common/multi_file/multi_file_states.hpp b/src/duckdb/src/include/duckdb/common/multi_file/multi_file_states.hpp index 52a6c2941..45fd80712 100644 --- a/src/duckdb/src/include/duckdb/common/multi_file/multi_file_states.hpp +++ b/src/duckdb/src/include/duckdb/common/multi_file/multi_file_states.hpp @@ -12,6 +12,7 @@ #include "duckdb/common/multi_file/multi_file_options.hpp" 
#include "duckdb/common/multi_file/base_file_reader.hpp" #include "duckdb/common/multi_file/multi_file_list.hpp" +#include "duckdb/common/windows_undefs.hpp" #include "duckdb/execution/expression_executor.hpp" namespace duckdb { diff --git a/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp b/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp index ff8893e71..99341c1fb 100644 --- a/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +++ b/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp @@ -11,7 +11,6 @@ #include "duckdb/common/enums/tuple_data_layout_enums.hpp" #include "duckdb/common/types/validity_mask.hpp" #include "duckdb/execution/operator/aggregate/aggregate_object.hpp" -#include "duckdb/planner/expression.hpp" #include "duckdb/planner/bound_result_modifier.hpp" namespace duckdb { @@ -129,6 +128,8 @@ class TupleDataLayout { inline bool CanHaveNull() const { return validity_type != TupleDataValidityType::CANNOT_HAVE_NULL_VALUES; } + //! Returns whether any of the columns are nested (they can compare equal even if there are no explicit NULLs) + bool HasNestedTypes() const; private: //! 
The types of the data columns diff --git a/src/duckdb/src/include/duckdb/common/vector_operations/vector_operations.hpp b/src/duckdb/src/include/duckdb/common/vector_operations/vector_operations.hpp index ff2bb4004..6abacc042 100644 --- a/src/duckdb/src/include/duckdb/common/vector_operations/vector_operations.hpp +++ b/src/duckdb/src/include/duckdb/common/vector_operations/vector_operations.hpp @@ -151,19 +151,6 @@ struct VectorOperations { optional_ptr false_sel, optional_ptr null_mask = nullptr); - //===--------------------------------------------------------------------===// - // Nested Comparisons - //===--------------------------------------------------------------------===// - // true := A != B with nulls being equal - static idx_t NestedNotEquals(const Vector &left, const Vector &right, optional_ptr sel, - idx_t count, optional_ptr true_sel, - optional_ptr false_sel, - optional_ptr null_mask = nullptr); - // true := A == B with nulls being equal - static idx_t NestedEquals(const Vector &left, const Vector &right, optional_ptr sel, - idx_t count, optional_ptr true_sel, - optional_ptr false_sel, optional_ptr null_mask = nullptr); - //===--------------------------------------------------------------------===// // Hash functions //===--------------------------------------------------------------------===// diff --git a/src/duckdb/src/include/duckdb/function/scalar_function.hpp b/src/duckdb/src/include/duckdb/function/scalar_function.hpp index fd57c9e59..0b01dba31 100644 --- a/src/duckdb/src/include/duckdb/function/scalar_function.hpp +++ b/src/duckdb/src/include/duckdb/function/scalar_function.hpp @@ -59,12 +59,22 @@ class ScalarFunctionCatalogEntry; struct StatementProperties; struct FunctionStatisticsPruneInput { - FunctionStatisticsPruneInput(optional_ptr bind_data_p, const BaseStatistics &stats_p) - : bind_data(bind_data_p), stats(stats_p) { + FunctionStatisticsPruneInput(const BoundFunctionExpression &function_p, optional_ptr bind_data_p, + const vector> 
&child_stats_p) + : function(function_p), bind_data(bind_data_p), child_stats(child_stats_p) { } + //! The bound function expression being checked (gives access to the argument expressions) + const BoundFunctionExpression &function; optional_ptr bind_data; - const BaseStatistics &stats; + + //! Statistics for each function argument (an entry is null if it could not be derived for that argument) + const vector> &child_stats; + + //! Convenience accessor: statistics of the i-th argument, or null if absent / not derivable + optional_ptr ChildStats(idx_t i) const { + return i < child_stats.size() ? child_stats[i] : optional_ptr(); + } }; struct FunctionStatisticsInput { diff --git a/src/duckdb/src/include/duckdb/main/client_context.hpp b/src/duckdb/src/include/duckdb/main/client_context.hpp index a70e33da9..385aeb344 100644 --- a/src/duckdb/src/include/duckdb/main/client_context.hpp +++ b/src/duckdb/src/include/duckdb/main/client_context.hpp @@ -24,6 +24,7 @@ #include "duckdb/main/client_properties.hpp" #include "duckdb/main/external_dependencies.hpp" #include "duckdb/main/pending_query_result.hpp" +#include "duckdb/main/statement_iterator.hpp" #include "duckdb/main/prepared_statement.hpp" #include "duckdb/main/stream_query_result.hpp" #include "duckdb/main/table_description.hpp" @@ -64,6 +65,22 @@ struct PendingQueryParameters { QueryParameters query_parameters; }; +//! A statement parameter: identifier ($1 -> "1"), binding index, and inferred type (UNKNOWN if not inferred). +struct StatementParameter { + Identifier identifier; + idx_t index; + LogicalType type; +}; + +//! A bound statement's signature: result schema (names/types) and parameter schema, without a +//! PreparedStatement. names/types are never empty; parameters are unordered (sort by index for positional use). +struct StatementSignature { + vector names; + vector types; + vector parameters; + StatementProperties properties; +}; + //! 
Interrupt state for the client context enum class ClientInterruptState : uint8_t { NOT_INTERRUPTED, INTERRUPTED, INTERRUPTS_SUPPRESSED }; @@ -183,6 +200,9 @@ class ClientContext : public enable_shared_from_this { DUCKDB_API unique_ptr Prepare(const string &query); //! Directly prepare a SQL statement DUCKDB_API unique_ptr Prepare(unique_ptr statement); + //! Bind a statement and return its signature, without building a PreparedStatement, optimizing, or + //! executing. Read-only: binding touches no in-flight query state, so a live result survives. Throws on error. + DUCKDB_API StatementSignature BindStatement(unique_ptr statement); //! Create a pending query result from a prepared statement with the given name and set of parameters //! It is possible that the prepared statement will be re-bound. This will generally happen if the catalog is @@ -207,12 +227,19 @@ class ClientContext : public enable_shared_from_this { //! Register function in the temporary schema DUCKDB_API void RegisterFunction(CreateFunctionInfo &info); - //! Parse statements from a query - DUCKDB_API vector> ParseStatements(const string &query); + //! Iterate a query's statements as a StatementIterator (iterator-style API). The caller drives + //! Peek() + GetStatement() to walk through ready-to-execute statements one by one + DUCKDB_API StatementIterator IterateStatements(const string &query); + + //! Preprocess a peel of parse-facing statements into engine-facing ones (PRAGMA reparse, + //! MULTI_STATEMENT unpack, transaction wrapping), replacing `buffer` in place. Acquires the + //! context lock internally when `lock` is null (callers that do not already hold it, e.g. the + //! shell). Drives StatementIterator's preprocessing. + DUCKDB_API void PreprocessStatements(vector> &buffer, + optional_ptr lock = nullptr); //! Extract the logical plan of a query DUCKDB_API unique_ptr ExtractPlan(const string &query); - DUCKDB_API void PreprocessStatements(vector> &statements); //! 
Runs a function with a valid transaction context, potentially starting a transaction if the context is in auto //! commit mode. @@ -260,8 +287,6 @@ class ClientContext : public enable_shared_from_this { DUCKDB_API LogicalType ParseLogicalType(const string &type); private: - //! Parse statements and resolve pragmas from a query - vector> ParseStatements(ClientContextLock &lock, const string &query); //! Issues a query to the database and returns a Pending Query Result unique_ptr PendingQueryInternal(ClientContextLock &lock, unique_ptr statement, const PendingQueryParameters &parameters, bool verify = true); diff --git a/src/duckdb/src/include/duckdb/main/extension_entries.hpp b/src/duckdb/src/include/duckdb/main/extension_entries.hpp index a6a6f516a..7fb80c0ff 100644 --- a/src/duckdb/src/include/duckdb/main/extension_entries.hpp +++ b/src/duckdb/src/include/duckdb/main/extension_entries.hpp @@ -129,6 +129,10 @@ static constexpr ExtensionFunctionEntry EXTENSION_FUNCTIONS[] = { {"ceiling", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY}, {"check_peg_parser", "autocomplete", CatalogType::TABLE_FUNCTION_ENTRY}, {"chr", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY}, + {"cloudformation_create_stack", "aws", CatalogType::TABLE_FUNCTION_ENTRY}, + {"cloudformation_delete_stack", "aws", CatalogType::TABLE_FUNCTION_ENTRY}, + {"cloudformation_describe_stack", "aws", CatalogType::TABLE_FUNCTION_ENTRY}, + {"cloudformation_list_stacks", "aws", CatalogType::TABLE_FUNCTION_ENTRY}, {"corr", "core_functions", CatalogType::AGGREGATE_FUNCTION_ENTRY}, {"cos", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY}, {"cosh", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY}, @@ -153,6 +157,7 @@ static constexpr ExtensionFunctionEntry EXTENSION_FUNCTIONS[] = { {"delta_scan", "delta", CatalogType::TABLE_FUNCTION_ENTRY}, {"drop_fts_index", "fts", CatalogType::PRAGMA_FUNCTION_ENTRY}, {"dsdgen", "tpcds", CatalogType::TABLE_FUNCTION_ENTRY}, + {"duckdb_aws_session_id", "aws", 
CatalogType::SCALAR_FUNCTION_ENTRY}, {"duckdb_format_sql", "autocomplete", CatalogType::SCALAR_FUNCTION_ENTRY}, {"duckdb_proj_compiled_version", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, {"duckdb_proj_version", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, diff --git a/src/duckdb/src/include/duckdb/main/parse_iterator.hpp b/src/duckdb/src/include/duckdb/main/parse_iterator.hpp new file mode 100644 index 000000000..738219974 --- /dev/null +++ b/src/duckdb/src/include/duckdb/main/parse_iterator.hpp @@ -0,0 +1,94 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// duckdb/main/parse_iterator.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "duckdb/common/string.hpp" +#include "duckdb/common/unique_ptr.hpp" +#include "duckdb/common/vector.hpp" + +namespace duckdb { +class ClientContext; +class Parser; +class SQLStatement; +struct MatcherToken; + +//! Iterator over the parse-facing statements of a multi-statement query. +//! +//! Usage: +//! ParseIterator it(context, sql); +//! while (it.Peek()) { +//! auto stmt = it.GetStatement(); +//! // raw, just-parsed statement +//! } +//! +//! Separator-only stretches (e.g. ";;;") are transparently skipped inside Peek — the caller +//! always sees either a real statement or a clean exhaustion. +class ParseIterator { +public: + //! Peel parse-facing statements out of a SQL string (PEG / parser_override). The context is + //! bound for the lifetime of the iterator and must outlive it. + DUCKDB_API ParseIterator(ClientContext &context, const string &sql); + DUCKDB_API ~ParseIterator(); + + ParseIterator(const ParseIterator &) = delete; + ParseIterator &operator=(const ParseIterator &) = delete; + DUCKDB_API ParseIterator(ParseIterator &&) noexcept; + // Not move-assignable: holds a ClientContext reference, which cannot be rebound. + ParseIterator &operator=(ParseIterator &&) = delete; + + //! 
Returns true if a statement is currently available (after parsing as needed). Returns + //! false when the iterator is exhausted. Non-const: parses on demand and buffers the result. + DUCKDB_API bool Peek(); + + //! Returns the next buffered statement and clears the buffer. Returns nullptr if no + //! statement is buffered — the caller should call Peek() first. + DUCKDB_API unique_ptr GetStatement(); + + //! Grammar-free predicate: does another statement remain in the input? Unlike Peek() this never + //! invokes the parser (only tokenizes / walks the token cursor, skipping separators), so it never + //! throws a parser error and is safe as a look-ahead before the current statement has executed. + //! Assumes the input has already been resolved by a prior Peek() (true for all current callers). + DUCKDB_API bool HasMore(); + + //! The context this iterator is bound to (used by StatementIterator to inherit it). + DUCKDB_API ClientContext &GetClientContext(); + +private: + //! Tokenize the full input once (grammar-free); no-op if already tokenized. + void EnsureTokenized(); + +private: + //! The bound context, used for parser options / metrics / override extensions. + ClientContext &context; + string sql; + //! Parser instance kept alive across Peek calls so its PEG matcher / transformer caches + //! stay warm. Constructed lazily on the first Peek. + unique_ptr parser; + //! Tokenized view of `sql`. Populated once on the first Peek and walked thereafter via + //! `token_cursor`, avoiding O(N²) re-tokenization across N statements. + unique_ptr> tokens; + //! Index into `tokens` at which the next match starts. + idx_t token_cursor = 0; + //! Single-statement buffer holding the result of the most recent Peek. Cleared by + //! GetStatement. + unique_ptr current_statement; + //! Once Peek determines there are no more statements (cursor past end of tokens), we stay + //! exhausted; subsequent Peek calls return false without re-invoking the parser. 
+ bool exhausted = false; + //! Statements produced by a successful `parser_override` extension; if non-empty the + //! iterator yields these in order instead of running the PEG parser at all. Populated on + //! the first Peek when an extension claims the query. + unique_ptr>> overridden_statements; + //! Cursor into `overridden_statements`. + idx_t override_cursor = 0; + //! True once we've consulted parser_override extensions for this query. + bool override_resolved = false; +}; + +} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/main/statement_iterator.hpp b/src/duckdb/src/include/duckdb/main/statement_iterator.hpp new file mode 100644 index 000000000..bb6183005 --- /dev/null +++ b/src/duckdb/src/include/duckdb/main/statement_iterator.hpp @@ -0,0 +1,79 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// duckdb/main/statement_iterator.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "duckdb/common/optional_ptr.hpp" +#include "duckdb/common/unique_ptr.hpp" +#include "duckdb/common/vector.hpp" +#include "duckdb/main/parse_iterator.hpp" + +namespace duckdb { +class ClientContext; +class ClientContextLock; +class SQLStatement; + +//! Iterator over the engine-facing statements of a query. +//! +//! Usage: +//! StatementIterator it(ParseIterator(context, sql)); +//! while (it.Peek()) { +//! auto stmt = it.GetStatement(); +//! if (!stmt) { +//! continue; // a peel that preprocessing swallowed (empty expansion) +//! } +//! // ready-to-execute statement +//! } +//! +class StatementIterator { +public: + //! Wrap a lazy parse-facing stream (consumed by move). The context is inherited from the + //! wrapped ParseIterator and must outlive this iterator. 
+ DUCKDB_API explicit StatementIterator(ParseIterator &&parse_iterator); + DUCKDB_API ~StatementIterator(); + + StatementIterator(const StatementIterator &) = delete; + StatementIterator &operator=(const StatementIterator &) = delete; + DUCKDB_API StatementIterator(StatementIterator &&) noexcept; + // Not move-assignable: holds a ClientContext reference, which cannot be rebound. + StatementIterator &operator=(StatementIterator &&) = delete; + + //! Returns true while more input remains (a buffered engine statement, or another parse-facing + //! statement to pull). Parses ahead as needed but does NOT preprocess — safe as a lookahead. + DUCKDB_API bool Peek(); + + //! Grammar-free predicate: is there another statement after the current one? Never parses and + //! never throws (see ParseIterator::HasMore), so it is safe to consult before the current + //! statement has executed — e.g. to decide whether the current statement is the last. + DUCKDB_API bool HasMore(); + + //! Pull + preprocess the next engine-facing statement. Returns nullptr when a peel preprocesses + //! to nothing (skip with `continue`) or when the input is exhausted (Peek would return false). + //! Self-locking variant for callers that do not hold the context lock. + DUCKDB_API unique_ptr GetStatement(); + //! Same, for callers that already hold the context lock. + DUCKDB_API unique_ptr GetStatementWithLock(ClientContextLock &lock); + +private: + //! Shared body for both Get variants. `lock` is null for the self-locking path (preprocessing + //! then acquires the lock itself) or the held lock for callers that already have it. + unique_ptr GetStatementInternal(optional_ptr lock); + +private: + //! The single parse-facing source this iterator preprocesses. Always constructed — the + //! single-statement ctor forwards to ParseIterator's single-statement ctor. + ParseIterator source; + //! The bound context, inherited from `source`. Used for preprocessing / transaction state / locking. 
+ ClientContext &context; + //! Engine-facing statements produced by preprocessing one parse-facing peel. Drained + //! one-at-a-time across GetStatement calls before pulling + preprocessing the next peel. + vector> buffer; + idx_t buffer_cursor = 0; +}; + +} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/optimizer/outer_join_simplification.hpp b/src/duckdb/src/include/duckdb/optimizer/outer_join_simplification.hpp index 6299642d5..e81ac1711 100644 --- a/src/duckdb/src/include/duckdb/optimizer/outer_join_simplification.hpp +++ b/src/duckdb/src/include/duckdb/optimizer/outer_join_simplification.hpp @@ -8,12 +8,23 @@ #pragma once +#include "duckdb/common/enums/expression_type.hpp" #include "duckdb/planner/column_binding_map.hpp" #include "duckdb/planner/logical_operator_visitor.hpp" namespace duckdb { -//! Simplifies FULL OUTER -> LEFT/RIGHT OUTER -> INNER if NULLs are filtered anyway +class Expression; +struct JoinCondition; +class LogicalAggregate; +class LogicalComparisonJoin; +class LogicalFilter; +class LogicalJoin; +class LogicalOrder; +class LogicalProjection; +class LogicalTopN; + +//! Simplifies outer joins if NULL-extended rows are filtered in a way that changes the join semantics class OuterJoinSimplification : public LogicalOperatorVisitor { public: OuterJoinSimplification(); @@ -22,11 +33,53 @@ class OuterJoinSimplification : public LogicalOperatorVisitor { void VisitOperator(LogicalOperator &op) override; private: + explicit OuterJoinSimplification(column_binding_set_t required_columns); + + //! 
Extract and propagate column references through expressions and join conditions + void AddColumnReferences(const Expression &expr, column_binding_set_t &bindings); + void AddRequiredColumns(const Expression &expr); + void AddRequiredColumns(const JoinCondition &condition); + void AddRequiredColumns(const vector &conditions); + bool GetColumnBinding(const Expression &expr, ColumnBinding &binding); + bool GetNullPreservingColumnBinding(const Expression &expr, ColumnBinding &binding); + + //! Track predicates that require columns to be NULL or reject NULL values void HandleExpression(const Expression &expr); + void HandleFilterExpression(const Expression &expr); + bool HandleIsNullExpression(const Expression &expr); + bool IsNullFilter(const Expression &expr, ColumnBinding &binding); + void InitializeRequiredColumns(LogicalOperator &op); + bool FiltersNulls(ExpressionType comparison_type); + + //! Rewrite joins based on the NULL constraints collected from operators above them + bool TryConvertLeftToAntiJoin(LogicalComparisonJoin &join); + bool HasNullRequiredColumns(const vector &bindings); + bool HasRequiredColumns(const vector &bindings); + void MarkEliminatedNullColumns(const vector &bindings); + vector GetRightBindings(LogicalJoin &join); + void SimplifyOuterJoinType(LogicalComparisonJoin &join); + + //! 
Visit operators while preserving only the constraints that can safely pass through each operator type + void VisitComparisonJoin(LogicalComparisonJoin &join, LogicalOperator &op); + void VisitInnerOrSemiJoin(LogicalComparisonJoin &join, LogicalOperator &op); + void VisitOuterJoin(LogicalComparisonJoin &join, LogicalOperator &op); + void VisitProjection(LogicalProjection &projection, LogicalOperator &op); + void VisitFilter(LogicalFilter &filter, LogicalOperator &op); + void VisitOrder(LogicalOrder &order, LogicalOperator &op); + void VisitTopN(LogicalTopN &top_n, LogicalOperator &op); + void VisitAggregate(LogicalAggregate &aggregate, LogicalOperator &op); + void VisitUnsupportedOperator(LogicalOperator &op); private: //! Columns that have their NULL values filtered column_binding_set_t null_filtered_columns; + //! Columns that are required to be NULL + column_binding_set_t null_required_columns; + //! Columns that are known to be NULL after rewriting a LEFT join to an ANTI join + column_binding_set_t eliminated_null_columns; + //! Columns that must remain available to operators above the current point in the plan + column_binding_set_t required_columns; + bool initialized_required_columns = false; }; } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/parser/parser.hpp b/src/duckdb/src/include/duckdb/parser/parser.hpp index 1864a580d..f90862e12 100644 --- a/src/duckdb/src/include/duckdb/parser/parser.hpp +++ b/src/duckdb/src/include/duckdb/parser/parser.hpp @@ -57,6 +57,14 @@ class Parser { //! using `stmt->stmt_location` / `stmt->stmt_length` if needed. DUCKDB_API unique_ptr ParseTopLevelStatement(vector &tokens, idx_t &token_cursor); + //! Run the `parse_function` extensions over the tail of `query` starting at `token_cursor`, + //! the way `ParseQuery` does in its catch handler. Returns the produced `ExtensionStatement` + //! and advances `token_cursor` past the bytes the extension claimed. Returns nullptr if no + //! extension claims the segment. 
Used by both `ParseQuery` and the lazy `ParseIterator` + //! so the two paths handle PEG failures identically. + DUCKDB_API unique_ptr TryParseExtensionStatement(vector &tokens, idx_t &token_cursor, + const string &query); + //! Tokenize a query, returning the raw tokens together with their locations static vector Tokenize(const string &query); @@ -90,6 +98,10 @@ class Parser { static bool StripUnicodeSpaces(const string &query_str, string &new_query); + //! Normalize a query string before parsing: validate UTF-8 (throws on invalid), then strip + //! non-ASCII Unicode spaces + static string NormalizeSQLString(const string &query); + void ThrowParserOverrideError(ParserOverrideResult &result); private: diff --git a/src/duckdb/src/include/duckdb/parser/peg/inlined_grammar.hpp b/src/duckdb/src/include/duckdb/parser/peg/inlined_grammar.hpp index 368f7621b..6f0cd65d1 100644 --- a/src/duckdb/src/include/duckdb/parser/peg/inlined_grammar.hpp +++ b/src/duckdb/src/include/duckdb/parser/peg/inlined_grammar.hpp @@ -1302,9 +1302,11 @@ const char INLINED_PEG_GRAMMAR[] = { "ByName <- 'BY' 'NAME'\n" "SelectStatementType <- OptionalParensSimpleSelect / ValuesClause / DescribeStatement / TableStatement / PivotStatement / UnpivotStatement\n" "ResultModifiers <- OrderByClause? 
LimitOffset?\n" - "LimitOffset <- LimitOffsetClause / OffsetLimitClause\n" + "LimitOffset <- LimitOffsetClause / OffsetFetchClause / OffsetLimitClause / FetchOnlyClause\n" "LimitOffsetClause <- LimitClause OffsetClause?\n" "OffsetLimitClause <- OffsetClause LimitClause?\n" + "OffsetFetchClause <- OffsetClause FetchClause\n" + "FetchOnlyClause <- FetchClause\n" "TableStatement <- 'TABLE' BaseTableName\n" "OptionalParensSimpleSelect <- SimpleSelectParens / SimpleSelect\n" "SimpleSelectParens <- Parens(SimpleSelect)\n" @@ -1442,6 +1444,9 @@ const char INLINED_PEG_GRAMMAR[] = { "LimitAll <- 'ALL'\n" "LimitLiteralPercent <- NumberLiteral 'PERCENT'\n" "LimitExpression <- Expression '%'?\n" + "FetchClause <- 'FETCH' FirstOrNext FetchValue RowOrRows 'ONLY'\n" + "FirstOrNext <- 'FIRST' / 'NEXT'\n" + "FetchValue <- Expression\n" "AliasedExpression <- ColIdExpression / ExpressionAsCollabel / ExpressionOptIdentifier\n" "ColIdExpression <- ColId ':' Expression\n" "ExpressionAsCollabel <- Expression 'AS' ColLabelOrString\n" diff --git a/src/duckdb/src/include/duckdb/parser/peg/tokenizer/parser_tokenizer.hpp b/src/duckdb/src/include/duckdb/parser/peg/tokenizer/parser_tokenizer.hpp index 1a2f489cf..89f9ec9f0 100644 --- a/src/duckdb/src/include/duckdb/parser/peg/tokenizer/parser_tokenizer.hpp +++ b/src/duckdb/src/include/duckdb/parser/peg/tokenizer/parser_tokenizer.hpp @@ -9,6 +9,7 @@ class ParserTokenizer : public BaseTokenizer { ParserTokenizer(const string &sql, vector &tokens); ~ParserTokenizer() override = default; + void PushToken(idx_t start, idx_t end, TokenType type, bool unterminated = false) override; void OnStatementEnd(idx_t pos) override; void OnLastToken(TokenizeState state, string last_word, idx_t last_pos) override; }; diff --git a/src/duckdb/src/include/duckdb/parser/peg/transformer/peg_transformer.hpp b/src/duckdb/src/include/duckdb/parser/peg/transformer/peg_transformer.hpp index e331b2e45..ebd96959c 100644 --- 
a/src/duckdb/src/include/duckdb/parser/peg/transformer/peg_transformer.hpp +++ b/src/duckdb/src/include/duckdb/parser/peg/transformer/peg_transformer.hpp @@ -3234,6 +3234,15 @@ class PEGTransformerFactory { static unique_ptr TransformOffsetLimitClause(PEGTransformer &transformer, LimitPercentResult offset_clause, optional limit_clause); + static unique_ptr TransformOffsetFetchClauseInternal(PEGTransformer &transformer, + ParseResult &parse_result); + static unique_ptr TransformOffsetFetchClause(PEGTransformer &transformer, + LimitPercentResult offset_clause, + LimitPercentResult fetch_clause); + static unique_ptr TransformFetchOnlyClauseInternal(PEGTransformer &transformer, + ParseResult &parse_result); + static unique_ptr TransformFetchOnlyClause(PEGTransformer &transformer, + LimitPercentResult fetch_clause); static unique_ptr TransformTableStatementInternal(PEGTransformer &transformer, ParseResult &parse_result); static unique_ptr TransformTableStatement(PEGTransformer &transformer, @@ -3698,6 +3707,11 @@ class PEGTransformerFactory { ParseResult &parse_result); static LimitPercentResult TransformLimitExpression(PEGTransformer &transformer, unique_ptr expression, const bool &has_result); + static unique_ptr TransformFetchClauseInternal(PEGTransformer &transformer, + ParseResult &parse_result); + static unique_ptr TransformFetchValueInternal(PEGTransformer &transformer, + ParseResult &parse_result); + static LimitPercentResult TransformFetchValue(PEGTransformer &transformer, unique_ptr expression); static unique_ptr TransformAliasedExpressionInternal(PEGTransformer &transformer, ParseResult &parse_result); static unique_ptr TransformColIdExpressionInternal(PEGTransformer &transformer, diff --git a/src/duckdb/src/include/duckdb/storage/statistics/geometry_stats.hpp b/src/duckdb/src/include/duckdb/storage/statistics/geometry_stats.hpp index dd5833b3a..f457fc624 100644 --- a/src/duckdb/src/include/duckdb/storage/statistics/geometry_stats.hpp +++ 
b/src/duckdb/src/include/duckdb/storage/statistics/geometry_stats.hpp @@ -341,10 +341,6 @@ struct GeometryStats { DUCKDB_API static void Verify(const BaseStatistics &stats, const Vector &vector, const SelectionVector &sel, idx_t count); - //! Check if a spatial predicate check with a constant could possibly be satisfied by rows given the statistics - DUCKDB_API static FilterPropagateResult CheckZonemap(const BaseStatistics &stats, - const unique_ptr &expr); - DUCKDB_API static GeometryExtent &GetExtent(BaseStatistics &stats); DUCKDB_API static const GeometryExtent &GetExtent(const BaseStatistics &stats); DUCKDB_API static GeometryTypeSet &GetTypes(BaseStatistics &stats); diff --git a/src/duckdb/src/main/client_context.cpp b/src/duckdb/src/main/client_context.cpp index 3f59113f5..395450f9e 100644 --- a/src/duckdb/src/main/client_context.cpp +++ b/src/duckdb/src/main/client_context.cpp @@ -19,7 +19,9 @@ #include "duckdb/main/client_data.hpp" #include "duckdb/main/database.hpp" #include "duckdb/main/database_manager.hpp" +#include "duckdb/main/statement_iterator.hpp" #include "duckdb/main/error_manager.hpp" +#include "duckdb/main/parse_iterator.hpp" #include "duckdb/main/materialized_query_result.hpp" #include "duckdb/main/query_profiler.hpp" #include "duckdb/main/query_result.hpp" @@ -46,8 +48,10 @@ #include "duckdb/parser/statement/relation_statement.hpp" #include "duckdb/parser/statement/select_statement.hpp" #include "duckdb/parser/tableref/column_data_ref.hpp" +#include "duckdb/planner/binder.hpp" #include "duckdb/planner/operator/logical_execute.hpp" #include "duckdb/planner/planner.hpp" +#include "duckdb/common/enums/current_transaction_state.hpp" #include "duckdb/planner/statement_preprocessor.hpp" #include "duckdb/storage/data_table.hpp" #include "duckdb/transaction/meta_transaction.hpp" @@ -785,25 +789,44 @@ void ClientContext::InitialCleanup(ClientContextLock &lock) { interrupt_state = ClientInterruptState::NOT_INTERRUPTED; } -vector> 
ClientContext::ParseStatements(const string &query) { - auto lock = LockContext(); - return ParseStatementsInternal(*lock, query); +StatementIterator ClientContext::IterateStatements(const string &query) { + // The iterator yields ready-to-execute (engine-facing) statements: PRAGMA reparse, + // MULTI_STATEMENT unpack and transaction wrapping per peel — matches the eager API users expect. + // Callers that want raw parse-facing statements and drive their own preprocessing construct a + // ParseIterator directly (e.g. Query / ParseStatementsInternal below, which hold the lock). + return StatementIterator(ParseIterator(*this, query)); } -vector> ClientContext::ParseStatementsInternal(ClientContextLock &lock, const string &query) { - try { - Parser parser(GetParserOptions()); - auto &profiler = QueryProfiler::Get(*this); - profiler.StartQuery(query); - auto parser_timer = profiler.StartTimer(); - parser.ParseQuery(query); - - StatementPreprocessor preprocessor(*this); - const CurrentTransactionState transaction_context_state = - transaction.HasActiveTransaction() ? IN_ACTIVE_TRANSACTION : NOT_IN_ACTIVE_TRANSACTION; - preprocessor.Preprocess(lock, parser.statements, transaction_context_state); +void ClientContext::PreprocessStatements(vector> &buffer, + optional_ptr lock) { + // Acquire our own lock if the caller doesn't hold one (e.g. the shell); own_lock keeps it alive + // for the duration of the preprocess pass. + unique_ptr own_lock; + if (!lock) { + own_lock = LockContext(); + lock = own_lock.get(); + } + StatementPreprocessor preprocessor(*this); + const CurrentTransactionState transaction_state = + transaction.HasActiveTransaction() ? 
IN_ACTIVE_TRANSACTION : NOT_IN_ACTIVE_TRANSACTION; + preprocessor.Preprocess(*lock, buffer, transaction_state); +} - return std::move(parser.statements); +vector> ClientContext::ParseStatementsInternal(ClientContextLock &lock, const string &query) { + try { + QueryProfiler::Get(*this).StartQuery(query); + + // Drain the lazy iterator into a vector for callers that want the eager shape. + StatementIterator iterator {ParseIterator(*this, query)}; + vector> result; + while (iterator.Peek()) { + auto stmt = iterator.GetStatementWithLock(lock); + if (!stmt) { + continue; // a peel that preprocessing swallowed + } + result.push_back(std::move(stmt)); + } + return result; } catch (std::exception &ex) { auto error = ErrorData(ex); ProcessError(error, query); @@ -811,15 +834,6 @@ vector> ClientContext::ParseStatementsInternal(ClientCo } } -void ClientContext::PreprocessStatements(vector> &statements) { - auto lock = LockContext(); - - StatementPreprocessor preprocessor(*this); - const CurrentTransactionState transaction_context_state = - transaction.HasActiveTransaction() ? IN_ACTIVE_TRANSACTION : NOT_IN_ACTIVE_TRANSACTION; - preprocessor.Preprocess(*lock, statements, transaction_context_state); -} - unique_ptr ClientContext::ExtractPlan(const string &query) { auto lock = LockContext(); @@ -881,6 +895,50 @@ unique_ptr ClientContext::Prepare(unique_ptr st } } +StatementSignature ClientContext::BindStatement(unique_ptr statement) { + auto lock = LockContext(); + auto named_param_map = statement->named_param_map; + StatementSignature signature; + ErrorData bind_error; + RunFunctionInTransactionInternal( + *lock, + [&]() { + try { + Planner planner(*this); + planner.CreatePlan(std::move(statement)); + signature.names = planner.names; + signature.types = planner.types; + signature.properties = std::move(planner.properties); + // Parameter types from the bound parameter map (as in PreparedStatementData::TryGetType). + // An un-anchored parameter (e.g. 
SELECT $1) gets no value_map entry, so its + // type stays UNKNOWN; do not assume every parameter is present. + for (auto &entry : named_param_map) { + LogicalType type(LogicalTypeId::UNKNOWN); + auto it = planner.value_map.find(entry.first); + if (it != planner.value_map.end()) { + type = it->second->return_type.id() != LogicalTypeId::INVALID ? it->second->return_type + : it->second->GetValue().type(); + } + signature.parameters.push_back({entry.first, entry.second, std::move(type)}); + } + } catch (const std::exception &ex) { + ErrorData error(ex); + // Binding is read-only: a recoverable bind error changed nothing, so leave + // the caller's transaction intact (rethrow after the wrapper). A database- + // fatal error still propagates so invalidation runs as usual. + if (Exception::InvalidatesDatabase(error.Type())) { + throw; + } + bind_error = std::move(error); + } + }, + false); + if (bind_error.HasError()) { + bind_error.Throw(); + } + return signature; +} + unique_ptr ClientContext::Prepare(const string &query) { auto lock = LockContext(); // prepare the query @@ -1105,13 +1163,37 @@ unique_ptr ClientContext::Query(unique_ptr statement, unique_ptr ClientContext::Query(const string &query, QueryParameters query_parameters) { auto lock = LockContext(); - vector> statements; + // The lazy path bypasses ParseStatementsInternal → InitialCleanup, so clear leftover query state + // (interrupt flag, etc.) ourselves. + InitialCleanup(*lock); + auto &profiler = QueryProfiler::Get(*this); + profiler.StartQuery(query); + // ParseIterator's constructor runs UTF-8 validation / Unicode-space strip and can throw — route + // through ErrorResult so the source location attaches the same way Peek failures do. 
+ optional_ptr iterator_ptr; + unique_ptr iterator_storage; try { - statements = ParseStatements(*lock, query); + iterator_storage = make_uniq(ParseIterator(*this, query)); + iterator_ptr = *iterator_storage; } catch (const std::exception &ex) { return ErrorResult(ErrorData(ex), query); } - if (statements.empty()) { + auto &iterator = *iterator_ptr; + + auto peek_or_error = [&](bool &has_now) -> unique_ptr { + try { + has_now = iterator.Peek(); + return nullptr; + } catch (const std::exception &ex) { + return ErrorResult(ErrorData(ex), query); + } + }; + + bool has_current = false; + if (auto error = peek_or_error(has_current)) { + return error; + } + if (!has_current) { // no statements, return empty successful result StatementProperties properties; vector names; @@ -1123,57 +1205,67 @@ unique_ptr ClientContext::Query(const string &query, QueryParameter unique_ptr result; optional_ptr last_result; bool last_had_result = false; - for (idx_t i = 0; i < statements.size(); i++) { - auto &statement = statements[i]; - bool is_last_statement = i + 1 == statements.size(); - PendingQueryParameters parameters; - parameters.query_parameters = query_parameters; - if (!is_last_statement) { - parameters.query_parameters.output_type = QueryResultOutputType::FORCE_MATERIALIZED; - } - auto pending_query = PendingQueryInternal(*lock, std::move(statement), parameters); - auto has_result = pending_query->properties.return_type == StatementReturnType::QUERY_RESULT; - unique_ptr current_result; - if (pending_query->HasError()) { - current_result = ErrorResult(pending_query->GetErrorObject()); - } else { - current_result = ExecutePendingQueryInternal(*lock, *pending_query); + while (has_current) { + // Get + preprocess the next engine-facing statement, reusing the lock we already hold. PRAGMA + // reparse / MULTI_STATEMENT unpacking happen inside GetStatementWithLock, which sees the + // transaction state left by the previously executed statement. 
A peel can preprocess to + // nothing, in which case statement is null and there is nothing to execute. + unique_ptr statement; + try { + statement = iterator.GetStatementWithLock(*lock); + } catch (const std::exception &ex) { + return ErrorResult(ErrorData(ex), query); } - if (current_result->HasError()) { - if (transaction.HasActiveTransaction() && transaction.GetAutoRollback()) { - transaction.Rollback(current_result->GetErrorObject()); + + // Look ahead WITHOUT parsing: HasMore() only walks the token cursor, so it never parses (and + // never throws) the next statement here. The next statement is parsed later, in this loop's + // next GetStatementWithLock — after the current statement has executed. This lets a statement + // register grammar (e.g. LOAD an extension) that a following statement then uses. + bool has_next = iterator.HasMore(); + + if (statement) { + bool is_last_overall = !has_next; + PendingQueryParameters parameters; + parameters.query_parameters = query_parameters; + if (!is_last_overall) { + parameters.query_parameters.output_type = QueryResultOutputType::FORCE_MATERIALIZED; } - // Reset the interrupted flag, this was set by the task that found the error - // Next statements should not be bothered by that interruption - interrupt_state = ClientInterruptState::NOT_INTERRUPTED; - return current_result; - } - // now append the result to the list of results - if (!last_result || !last_had_result) { - // first result of the query - result = std::move(current_result); - last_result = result.get(); - last_had_result = has_result; - } else { - // later results; attach to the result chain - // but only if there is a result - if (!has_result) { - continue; + auto pending_query = PendingQueryInternal(*lock, std::move(statement), parameters); + auto has_result = pending_query->properties.return_type == StatementReturnType::QUERY_RESULT; + unique_ptr current_result; + if (pending_query->HasError()) { + current_result = 
ErrorResult(pending_query->GetErrorObject()); + } else { + current_result = ExecutePendingQueryInternal(*lock, *pending_query); + } + if (current_result->HasError()) { + if (transaction.HasActiveTransaction() && transaction.GetAutoRollback()) { + transaction.Rollback(current_result->GetErrorObject()); + } + // Reset the interrupted flag, this was set by the task that found the error + // Next statements should not be bothered by that interruption + interrupt_state = ClientInterruptState::NOT_INTERRUPTED; + return current_result; + } + // now append the result to the list of results + if (!last_result || !last_had_result) { + // first result of the query + result = std::move(current_result); + last_result = result.get(); + last_had_result = has_result; + } else if (has_result) { + // later results; attach to the result chain, but only if there is a result + last_result->next = std::move(current_result); + last_result = last_result->next.get(); } - last_result->next = std::move(current_result); - last_result = last_result->next.get(); + D_ASSERT(last_result); } - D_ASSERT(last_result); + + has_current = has_next; } return result; } -vector> ClientContext::ParseStatements(ClientContextLock &lock, const string &query) { - InitialCleanup(lock); - // parse the query and transform it into a set of statements - return ParseStatementsInternal(lock, query); -} - unique_ptr ClientContext::PendingQuery(const string &query, QueryParameters parameters) { identifier_map_t empty_param_list; return PendingQuery(query, empty_param_list, parameters); @@ -1427,6 +1519,9 @@ void ClientContext::TryBindRelation(Relation &relation, vector unordered_set ClientContext::GetTableNames(const string &query, const bool qualified) { auto lock = LockContext(); + // Preprocess before binding so PRAGMA reparse / macro expansion happens up front — GetTableNames + // extracts names from the *underlying* query (e.g. `PRAGMA tpch(1)` -> the TPC-H SELECT, whose + // tables are what the caller wants). 
A raw, un-preprocessed PRAGMA would never surface them. auto statements = ParseStatementsInternal(*lock, query); if (statements.size() != 1) { throw InvalidInputException("Expected a single statement"); diff --git a/src/duckdb/src/main/connection.cpp b/src/duckdb/src/main/connection.cpp index 63e15c1e1..c5bf25197 100644 --- a/src/duckdb/src/main/connection.cpp +++ b/src/duckdb/src/main/connection.cpp @@ -183,7 +183,17 @@ unique_ptr Connection::TableInfo(const Identifier &table_name) } vector> Connection::ExtractStatements(const string &query) { - return context->ParseStatements(query); + // Eager convenience over the lazy ClientContext::IterateStatements iterator: drain the + // engine-facing statements into a vector. + auto &client_context = *context; + auto iterator = client_context.IterateStatements(query); + vector> result; + while (iterator.Peek()) { + if (auto statement = iterator.GetStatement()) { + result.push_back(std::move(statement)); + } + } + return result; } unique_ptr Connection::ExtractPlan(const string &query) { diff --git a/src/duckdb/src/main/parse_iterator.cpp b/src/duckdb/src/main/parse_iterator.cpp new file mode 100644 index 000000000..a235ff3ae --- /dev/null +++ b/src/duckdb/src/main/parse_iterator.cpp @@ -0,0 +1,184 @@ +#include "duckdb/main/parse_iterator.hpp" + +#include "duckdb/main/client_context.hpp" +#include "duckdb/main/extension_callback_manager.hpp" +#include "duckdb/main/query_profiler.hpp" +#include "duckdb/parser/parser.hpp" +#include "duckdb/parser/parser_extension.hpp" +#include "duckdb/parser/peg/matcher.hpp" +#include "duckdb/parser/peg/tokenizer/parser_tokenizer.hpp" +#include "duckdb/parser/sql_statement.hpp" +#include "duckdb/parser/statement/create_statement.hpp" +#include "duckdb/parser/parsed_data/create_info.hpp" + +namespace duckdb { + +ParseIterator::ParseIterator(ClientContext &context_p, const string &sql_p) + : context(context_p), sql(Parser::NormalizeSQLString(sql_p)) { +} + +ParseIterator::~ParseIterator() =
default; + +ParseIterator::ParseIterator(ParseIterator &&) noexcept = default; + +ClientContext &ParseIterator::GetClientContext() { + return context; +} + +bool ParseIterator::Peek() { + auto &client_context = context; + // Already buffered from a prior Peek — just report it. + if (current_statement) { + return true; + } + if (exhausted) { + return false; + } + // Charge the time spent tokenizing/parsing on this Peek to MetricParserTotalTime so callers + // get parse metrics without each having to remember to wrap us in a timer. + auto parser_timer = QueryProfiler::Get(client_context).StartTimer(); + auto options = client_context.GetParserOptions(); + // On the very first Peek, give `parser_override` extensions a chance to claim the whole + // query. If one does, we yield its statements one at a time and skip the PEG path entirely. + if (!override_resolved) { + override_resolved = true; + if (options.extensions) { + bool has_strict_extension_error = false; + ErrorData last_strict_extension_error; + for (auto &ext : options.extensions->ParserExtensions()) { + if (!ext.parser_override) { + continue; + } + if (options.parser_override_setting == AllowParserOverride::DEFAULT_OVERRIDE) { + continue; + } + auto result = ext.parser_override(ext.parser_info.get(), sql, options); + if (result.type == ParserExtensionResultType::PARSE_SUCCESSFUL) { + overridden_statements = make_uniq>>(std::move(result.statements)); + break; + } + if (options.parser_override_setting == AllowParserOverride::STRICT_OVERRIDE) { + if (result.type == ParserExtensionResultType::DISPLAY_EXTENSION_ERROR) { + has_strict_extension_error = true; + last_strict_extension_error = std::move(result.error); + } else { + has_strict_extension_error = false; + } + continue; + } + } + if (!overridden_statements && options.parser_override_setting == AllowParserOverride::STRICT_OVERRIDE && + has_strict_extension_error) { + last_strict_extension_error.Throw(); + } + } + } + if (overridden_statements) { + if 
(override_cursor >= overridden_statements->size()) { + exhausted = true; + return false; + } + current_statement = std::move((*overridden_statements)[override_cursor++]); + return true; + } + if (!parser) { + parser = make_uniq(options); + } + EnsureTokenized(); + // Walk the token cursor through the cached `tokens`, calling Parser::ParseTopLevelStatement + // repeatedly. A nullptr return with cursor advanced means a separator-only TopLevelStatement + // (e.g. between statements or trailing ';'s); we loop past it. A nullptr return with cursor + // at end means the input is exhausted. + auto at_end_of_real_tokens = [&]() { + return token_cursor >= tokens->size() || (*tokens)[token_cursor].type == TokenType::END_OF_INPUT; + }; + while (true) { + if (at_end_of_real_tokens()) { + exhausted = true; + return false; + } + unique_ptr stmt; + try { + stmt = parser->ParseTopLevelStatement(*tokens, token_cursor); + } catch (ParserException &) { + // Mirror Parser::ParseQuery's parse_function-extension fallback so extensions like + // `quack` can claim a segment that PEG couldn't parse. + stmt = parser->TryParseExtensionStatement(*tokens, token_cursor, sql); + if (!stmt) { + throw; + } + } + if (stmt) { + // ParseTopLevelStatement doesn't populate stmt->query (it operates on tokens, not the + // source string). Mirror Parser::ParseQuery's per-statement post-processing: span from + // the statement's start to the next statement's start (or end of input) so the trailing + // `;` and inter-statement whitespace end up inside stmt->query — downstream consumers + // (logging, error reporting, EXPLAIN) rely on that shape. 
+ idx_t stmt_loc = stmt->stmt_location; + idx_t end_loc = sql.size(); + if (token_cursor < tokens->size() && (*tokens)[token_cursor].type != TokenType::END_OF_INPUT) { + end_loc = (*tokens)[token_cursor].offset; + } + stmt->query = sql.substr(stmt_loc, end_loc - stmt_loc); + stmt->stmt_location = 0; + stmt->stmt_length = stmt->query.size(); + if (stmt->type == StatementType::CREATE_STATEMENT) { + auto &create = stmt->Cast(); + create.info->sql = stmt->query; + } + current_statement = std::move(stmt); + return true; + } + if (at_end_of_real_tokens()) { + exhausted = true; + return false; + } + // separator-only TLS in the middle of the input — loop and try the next. + } +} + +void ParseIterator::EnsureTokenized() { + if (!tokens) { + // Tokenize the full input once. Subsequent Peek/HasMore calls walk through `tokens` via + // `token_cursor`; we never re-tokenize. Tokenization is grammar-free. + tokens = make_uniq>(); + ParserTokenizer tokenizer(sql, *tokens); + tokenizer.TokenizeInput(); + } +} + +bool ParseIterator::HasMore() { + // A statement is already parsed and buffered by a prior Peek. + if (current_statement) { + return true; + } + if (exhausted) { + return false; + } + // parser_override path: yield remaining overridden statements. + if (overridden_statements) { + return override_cursor < overridden_statements->size(); + } + // PEG path: walk the token cursor without parsing. There is another statement iff a real token + // (neither a `;` separator nor the end-of-input sentinel) remains ahead of the cursor. 
+ EnsureTokenized(); + for (idx_t i = token_cursor; i < tokens->size(); i++) { + const auto type = (*tokens)[i].type; + if (type == TokenType::END_OF_INPUT) { + return false; + } + if (type != TokenType::TERMINATOR) { + return true; + } + } + return false; +} + +unique_ptr ParseIterator::GetStatement() { + if (!current_statement) { + return nullptr; + } + return std::move(current_statement); +} + +} // namespace duckdb diff --git a/src/duckdb/src/main/statement_iterator.cpp b/src/duckdb/src/main/statement_iterator.cpp new file mode 100644 index 000000000..e73525cc8 --- /dev/null +++ b/src/duckdb/src/main/statement_iterator.cpp @@ -0,0 +1,67 @@ +#include "duckdb/main/statement_iterator.hpp" + +#include "duckdb/main/client_context.hpp" +#include "duckdb/parser/sql_statement.hpp" + +namespace duckdb { + +StatementIterator::StatementIterator(ParseIterator &&parse_iterator) + : source(std::move(parse_iterator)), context(source.GetClientContext()) { +} + +StatementIterator::~StatementIterator() = default; + +StatementIterator::StatementIterator(StatementIterator &&) noexcept = default; + +bool StatementIterator::Peek() { + // More buffered engine statements from the current peel's expansion? + if (buffer_cursor < buffer.size()) { + return true; + } + // Otherwise, is there another parse-facing statement to pull? Parses ahead, does NOT preprocess + // — safe to use as a lookahead. + return source.Peek(); +} + +bool StatementIterator::HasMore() { + // Buffered engine statements from the current peel still remain? + if (buffer_cursor < buffer.size()) { + return true; + } + // Otherwise defer to the parse-facing source's grammar-free existence check. + return source.HasMore(); +} + +unique_ptr StatementIterator::GetStatementInternal(optional_ptr lock) { + // Drain the current peel's expansion first. + if (buffer_cursor < buffer.size()) { + return std::move(buffer[buffer_cursor++]); + } + // Pull the next parse-facing statement. 
+ if (!source.Peek()) { + return nullptr; // exhausted + } + auto stmt = source.GetStatement(); + buffer.clear(); + buffer_cursor = 0; + buffer.push_back(std::move(stmt)); + // Preprocess the peel into one-or-more engine-facing statements. This runs in Get (not Peek) so it + // sees the transaction state left by the previously executed statement. + context.PreprocessStatements(buffer, lock); + if (buffer.empty()) { + // Preprocessing swallowed the peel — caller skips with `continue`; the next Get pulls on. + return nullptr; + } + buffer_cursor = 1; + return std::move(buffer[0]); +} + +unique_ptr StatementIterator::GetStatement() { + return GetStatementInternal(nullptr); +} + +unique_ptr StatementIterator::GetStatementWithLock(ClientContextLock &lock) { + return GetStatementInternal(&lock); +} + +} // namespace duckdb diff --git a/src/duckdb/src/optimizer/outer_join_simplification.cpp b/src/duckdb/src/optimizer/outer_join_simplification.cpp index e07921b55..bd253c14d 100644 --- a/src/duckdb/src/optimizer/outer_join_simplification.cpp +++ b/src/duckdb/src/optimizer/outer_join_simplification.cpp @@ -1,138 +1,449 @@ #include "duckdb/optimizer/outer_join_simplification.hpp" #include "duckdb/planner/operator/list.hpp" +#include "duckdb/planner/expression/bound_cast_expression.hpp" +#include "duckdb/planner/expression/bound_columnref_expression.hpp" #include "duckdb/planner/expression/bound_comparison_expression.hpp" +#include "duckdb/planner/expression/bound_constant_expression.hpp" #include "duckdb/planner/expression/bound_operator_expression.hpp" #include "duckdb/planner/expression/bound_function_expression.hpp" +#include "duckdb/planner/expression_iterator.hpp" namespace duckdb { +//===--------------------------------------------------------------------===// +// Setup +//===--------------------------------------------------------------------===// + OuterJoinSimplification::OuterJoinSimplification() { } -void OuterJoinSimplification::HandleExpression(const Expression 
&expr) { - // TODO: could unwrap casts or basic arithmetic +OuterJoinSimplification::OuterJoinSimplification(column_binding_set_t required_columns_p) + : required_columns(std::move(required_columns_p)), initialized_required_columns(true) { +} + +//===--------------------------------------------------------------------===// +// Column Binding Helpers +//===--------------------------------------------------------------------===// + +void OuterJoinSimplification::AddColumnReferences(const Expression &expr, column_binding_set_t &bindings) { + ExpressionIterator::VisitExpression(expr, [&](const BoundColumnRefExpression &colref) { + if (colref.Depth() == 0) { + bindings.insert(colref.Binding()); + } + }); +} + +void OuterJoinSimplification::AddRequiredColumns(const Expression &expr) { + AddColumnReferences(expr, required_columns); +} + +void OuterJoinSimplification::AddRequiredColumns(const JoinCondition &condition) { + if (condition.IsComparison()) { + AddRequiredColumns(condition.GetLHS()); + AddRequiredColumns(condition.GetRHS()); + } else { + AddRequiredColumns(condition.GetJoinExpression()); + } +} + +void OuterJoinSimplification::AddRequiredColumns(const vector &conditions) { + for (const auto &condition : conditions) { + AddRequiredColumns(condition); + } +} + +bool OuterJoinSimplification::GetColumnBinding(const Expression &expr, ColumnBinding &binding) { if (expr.GetExpressionClass() != ExpressionClass::BOUND_COLUMN_REF) { - return; + if (expr.GetExpressionClass() != ExpressionClass::BOUND_CAST) { + return false; + } + auto &cast = expr.Cast(); + return GetColumnBinding(cast.Child(), binding); } auto &colref = expr.Cast(); - null_filtered_columns.insert(colref.Binding()); + binding = colref.Binding(); + return true; } -void OuterJoinSimplification::VisitOperator(LogicalOperator &op) { - switch (op.type) { - case LogicalOperatorType::LOGICAL_COMPARISON_JOIN: { - auto &join = op.Cast(); - switch (join.join_type) { - case JoinType::INNER: - case JoinType::SEMI: { - 
// Derive bindings that cannot be NULL - for (const auto &condition : join.conditions) { - if (!condition.IsComparison()) { - continue; // Non-comparison predicate, bail - } - if (condition.GetComparisonType() == ExpressionType::COMPARE_DISTINCT_FROM || - condition.GetComparisonType() == ExpressionType::COMPARE_NOT_DISTINCT_FROM) { - continue; // Predicate does not filter NULL values - } - HandleExpression(condition.GetLHS()); - HandleExpression(condition.GetRHS()); - } - VisitOperatorChildren(op); - return; - } - case JoinType::LEFT: - case JoinType::RIGHT: - case JoinType::OUTER: { - // Try to simplify joins - bool preserves_null_extended_rows[2] = { - join.join_type == JoinType::LEFT || join.join_type == JoinType::OUTER, - join.join_type == JoinType::RIGHT || join.join_type == JoinType::OUTER}; - for (idx_t child_idx = 0; child_idx < 2; child_idx++) { - for (const auto &binding : join.children[child_idx]->GetColumnBindings()) { - if (null_filtered_columns.find(binding) != null_filtered_columns.end()) { - // Rejecting NULLS in one child removes preservation of NULL extended rows for the other child - preserves_null_extended_rows[1 - child_idx] = false; - break; - } - } - } +bool OuterJoinSimplification::GetNullPreservingColumnBinding(const Expression &expr, ColumnBinding &binding) { + if (expr.GetExpressionClass() != ExpressionClass::BOUND_COLUMN_REF) { + if (expr.GetExpressionClass() != ExpressionClass::BOUND_CAST) { + return false; + } + auto &cast = expr.Cast(); + if (cast.IsTryCast() || !BoundCastExpression::CastIsInvertible(cast.source_type(), cast.GetReturnType())) { + return false; + } + return GetNullPreservingColumnBinding(cast.Child(), binding); + } + auto &colref = expr.Cast(); + binding = colref.Binding(); + return true; +} - if (!preserves_null_extended_rows[0] && !preserves_null_extended_rows[1]) { - join.join_type = JoinType::INNER; - VisitOperator(op); // Re-enter because we just created another (NULL-filtering!) 
INNER join - return; - } +//===--------------------------------------------------------------------===// +// NULL Constraint Tracking +//===--------------------------------------------------------------------===// - if (preserves_null_extended_rows[0] && !preserves_null_extended_rows[1]) { - D_ASSERT(join.join_type == JoinType::LEFT || join.join_type == JoinType::OUTER); - join.join_type = JoinType::LEFT; - } else if (!preserves_null_extended_rows[0] && preserves_null_extended_rows[1]) { - D_ASSERT(join.join_type == JoinType::RIGHT || join.join_type == JoinType::OUTER); - join.join_type = JoinType::RIGHT; - } else { - D_ASSERT(join.join_type == JoinType::OUTER); - join.join_type = JoinType::OUTER; - } +void OuterJoinSimplification::HandleExpression(const Expression &expr) { + ColumnBinding binding; + if (GetColumnBinding(expr, binding)) { + null_filtered_columns.insert(binding); + } +} + +void OuterJoinSimplification::HandleFilterExpression(const Expression &expr) { + if (expr.GetExpressionClass() == ExpressionClass::BOUND_OPERATOR && + expr.GetExpressionType() == ExpressionType::OPERATOR_IS_NOT_NULL) { + const auto &is_not_null = expr.Cast(); + HandleExpression(*is_not_null.GetChildren()[0]); + AddRequiredColumns(expr); + return; + } - VisitOperatorChildren(op); - return; + if (expr.GetExpressionClass() == ExpressionClass::BOUND_OPERATOR && + expr.GetExpressionType() == ExpressionType::OPERATOR_IS_NULL) { + const auto &is_null = expr.Cast(); + if (!HandleIsNullExpression(*is_null.GetChildren()[0])) { + AddRequiredColumns(expr); } - default: - // Passthrough not supported. 
- break; + return; + } + + if (!BoundComparisonExpression::IsComparison(expr)) { + AddRequiredColumns(expr); + return; + } + + AddRequiredColumns(expr); + if (!FiltersNulls(expr.GetExpressionType())) { + return; + } + const auto &comparison = expr.Cast(); + HandleExpression(BoundComparisonExpression::Left(comparison)); + HandleExpression(BoundComparisonExpression::Right(comparison)); +} + +bool OuterJoinSimplification::HandleIsNullExpression(const Expression &expr) { + ColumnBinding binding; + if (GetNullPreservingColumnBinding(expr, binding)) { + null_required_columns.insert(binding); + return true; + } + return false; +} + +bool OuterJoinSimplification::IsNullFilter(const Expression &expr, ColumnBinding &binding) { + if (expr.GetExpressionClass() != ExpressionClass::BOUND_OPERATOR || + expr.GetExpressionType() != ExpressionType::OPERATOR_IS_NULL) { + return false; + } + auto &is_null = expr.Cast(); + auto &child = *is_null.GetChildren()[0]; + return GetNullPreservingColumnBinding(child, binding); +} + +void OuterJoinSimplification::InitializeRequiredColumns(LogicalOperator &op) { + if (initialized_required_columns) { + return; + } + for (const auto &binding : op.GetColumnBindings()) { + required_columns.insert(binding); + } + initialized_required_columns = true; +} + +bool OuterJoinSimplification::FiltersNulls(ExpressionType comparison_type) { + return comparison_type != ExpressionType::COMPARE_DISTINCT_FROM && + comparison_type != ExpressionType::COMPARE_NOT_DISTINCT_FROM; +} + +//===--------------------------------------------------------------------===// +// Join Rewrite Helpers +//===--------------------------------------------------------------------===// + +vector OuterJoinSimplification::GetRightBindings(LogicalJoin &join) { + return LogicalOperator::MapBindings(join.children[1]->GetColumnBindings(), join.right_projection_map); +} + +bool OuterJoinSimplification::HasNullRequiredColumns(const vector &bindings) { + for (const auto &binding : bindings) { + if 
(null_required_columns.find(binding) != null_required_columns.end()) { + return true; } - break; } - case LogicalOperatorType::LOGICAL_PROJECTION: { - // Passthrough supported. Add input bindings - auto &projection = op.Cast(); - for (idx_t col_idx = 0; col_idx < projection.expressions.size(); col_idx++) { - auto &expr = *projection.expressions[col_idx]; - if (expr.GetExpressionClass() != ExpressionClass::BOUND_COLUMN_REF) { - continue; - } - const ColumnBinding binding(projection.table_index, ProjectionIndex(col_idx)); - if (null_filtered_columns.find(binding) == null_filtered_columns.end()) { - continue; + return false; +} + +bool OuterJoinSimplification::HasRequiredColumns(const vector &bindings) { + for (const auto &binding : bindings) { + if (required_columns.find(binding) != required_columns.end()) { + return true; + } + } + return false; +} + +void OuterJoinSimplification::MarkEliminatedNullColumns(const vector &bindings) { + for (const auto &binding : bindings) { + eliminated_null_columns.insert(binding); + } +} + +bool OuterJoinSimplification::TryConvertLeftToAntiJoin(LogicalComparisonJoin &join) { + if (join.join_type != JoinType::LEFT) { + return false; + } + + auto right_bindings = GetRightBindings(join); + if (!HasNullRequiredColumns(right_bindings) || HasRequiredColumns(right_bindings)) { + return false; + } + + column_binding_set_t match_non_null_rhs; + for (const auto &condition : join.conditions) { + if (!condition.IsComparison() || !FiltersNulls(condition.GetComparisonType())) { + continue; + } + ColumnBinding rhs_binding; + if (!GetColumnBinding(condition.GetRHS(), rhs_binding)) { + continue; + } + match_non_null_rhs.insert(rhs_binding); + } + + for (const auto &binding : right_bindings) { + if (null_required_columns.find(binding) == null_required_columns.end()) { + continue; + } + if (match_non_null_rhs.find(binding) == match_non_null_rhs.end()) { + continue; + } + join.join_type = JoinType::ANTI; + MarkEliminatedNullColumns(right_bindings); + 
return true; + } + return false; +} + +void OuterJoinSimplification::SimplifyOuterJoinType(LogicalComparisonJoin &join) { + bool preserves_null_extended_rows[2] = {join.join_type == JoinType::LEFT || join.join_type == JoinType::OUTER, + join.join_type == JoinType::RIGHT || join.join_type == JoinType::OUTER}; + for (idx_t child_idx = 0; child_idx < 2; child_idx++) { + for (const auto &binding : join.children[child_idx]->GetColumnBindings()) { + if (null_filtered_columns.find(binding) != null_filtered_columns.end()) { + // Rejecting NULLs in one child removes preservation of NULL extended rows for the other child. + preserves_null_extended_rows[1 - child_idx] = false; + break; } - null_filtered_columns.insert(expr.Cast().Binding()); } + } + + if (!preserves_null_extended_rows[0] && !preserves_null_extended_rows[1]) { + join.join_type = JoinType::INNER; + return; + } + + if (preserves_null_extended_rows[0] && !preserves_null_extended_rows[1]) { + D_ASSERT(join.join_type == JoinType::LEFT || join.join_type == JoinType::OUTER); + join.join_type = JoinType::LEFT; + } else if (!preserves_null_extended_rows[0] && preserves_null_extended_rows[1]) { + D_ASSERT(join.join_type == JoinType::RIGHT || join.join_type == JoinType::OUTER); + join.join_type = JoinType::RIGHT; + } else { + D_ASSERT(join.join_type == JoinType::OUTER); + join.join_type = JoinType::OUTER; + } +} + +//===--------------------------------------------------------------------===// +// Operator Visitors +//===--------------------------------------------------------------------===// + +void OuterJoinSimplification::VisitComparisonJoin(LogicalComparisonJoin &join, LogicalOperator &op) { + switch (join.join_type) { + case JoinType::INNER: + case JoinType::SEMI: + VisitInnerOrSemiJoin(join, op); + return; + case JoinType::LEFT: + case JoinType::RIGHT: + case JoinType::OUTER: + VisitOuterJoin(join, op); + return; + default: + break; + } + VisitUnsupportedOperator(op); +} + +void 
OuterJoinSimplification::VisitInnerOrSemiJoin(LogicalComparisonJoin &join, LogicalOperator &op) { + for (const auto &condition : join.conditions) { + if (!condition.IsComparison()) { + continue; + } + AddRequiredColumns(condition); + if (!FiltersNulls(condition.GetComparisonType())) { + continue; + } + HandleExpression(condition.GetLHS()); + HandleExpression(condition.GetRHS()); + } + VisitOperatorChildren(op); +} + +void OuterJoinSimplification::VisitOuterJoin(LogicalComparisonJoin &join, LogicalOperator &op) { + if (TryConvertLeftToAntiJoin(join)) { + AddRequiredColumns(join.conditions); VisitOperatorChildren(op); return; } + + SimplifyOuterJoinType(join); + if (join.join_type == JoinType::INNER) { + VisitOperator(op); + return; + } + + AddRequiredColumns(join.conditions); + VisitOperatorChildren(op); +} + +void OuterJoinSimplification::VisitProjection(LogicalProjection &projection, LogicalOperator &op) { + vector> direct_projection_map; + for (idx_t col_idx = 0; col_idx < projection.expressions.size(); col_idx++) { + auto &expr = *projection.expressions[col_idx]; + const ColumnBinding binding(projection.table_index, ProjectionIndex(col_idx)); + if (expr.GetExpressionClass() != ExpressionClass::BOUND_COLUMN_REF) { + AddRequiredColumns(expr); + continue; + } + + auto input_binding = expr.Cast().Binding(); + direct_projection_map.emplace_back(col_idx, input_binding); + if (null_filtered_columns.find(binding) != null_filtered_columns.end()) { + null_filtered_columns.insert(input_binding); + } + if (null_required_columns.find(binding) != null_required_columns.end()) { + null_required_columns.insert(input_binding); + } + } + VisitOperatorChildren(op); + + for (const auto &entry : direct_projection_map) { + auto col_idx = entry.first; + auto input_binding = entry.second; + if (eliminated_null_columns.find(input_binding) == eliminated_null_columns.end()) { + continue; + } + const ColumnBinding output_binding(projection.table_index, ProjectionIndex(col_idx)); + auto type 
= projection.expressions[col_idx]->GetReturnType(); + projection.expressions[col_idx] = make_uniq(Value(type)); + eliminated_null_columns.insert(output_binding); + } +} + +void OuterJoinSimplification::VisitFilter(LogicalFilter &filter, LogicalOperator &op) { + filter.SplitPredicates(filter.expressions); + for (const auto &expr : filter.expressions) { + HandleFilterExpression(*expr); + } + VisitOperatorChildren(op); + + for (idx_t i = 0; i < filter.expressions.size(); i++) { + ColumnBinding binding; + if (!IsNullFilter(*filter.expressions[i], binding)) { + continue; + } + if (eliminated_null_columns.find(binding) == eliminated_null_columns.end()) { + continue; + } + filter.expressions.erase_at(i); + i--; + } +} + +void OuterJoinSimplification::VisitOrder(LogicalOrder &order, LogicalOperator &op) { + for (const auto &order_node : order.orders) { + AddRequiredColumns(*order_node.expression); + } + VisitOperatorChildren(op); +} + +void OuterJoinSimplification::VisitTopN(LogicalTopN &top_n, LogicalOperator &op) { + for (const auto &order_node : top_n.orders) { + AddRequiredColumns(*order_node.expression); + } + VisitOperatorChildren(op); +} + +void OuterJoinSimplification::VisitAggregate(LogicalAggregate &aggregate, LogicalOperator &op) { + column_binding_set_t child_required_columns; + for (const auto &expr : aggregate.groups) { + AddColumnReferences(*expr, child_required_columns); + } + for (const auto &expr : aggregate.expressions) { + AddColumnReferences(*expr, child_required_columns); + } + OuterJoinSimplification child_simplification(std::move(child_required_columns)); + child_simplification.VisitOperator(*op.children[0]); +} + +void OuterJoinSimplification::VisitUnsupportedOperator(LogicalOperator &op) { + for (auto &child : op.children) { + column_binding_set_t child_required_columns; + for (const auto &binding : child->GetColumnBindings()) { + child_required_columns.insert(binding); + } + OuterJoinSimplification 
outer_join_simplification(std::move(child_required_columns)); + outer_join_simplification.VisitOperator(*child); + } +} + +void OuterJoinSimplification::VisitOperator(LogicalOperator &op) { + InitializeRequiredColumns(op); + + switch (op.type) { + case LogicalOperatorType::LOGICAL_COMPARISON_JOIN: { + auto &join = op.Cast(); + VisitComparisonJoin(join, op); + return; + } + case LogicalOperatorType::LOGICAL_PROJECTION: { + auto &projection = op.Cast(); + VisitProjection(projection, op); + return; + } case LogicalOperatorType::LOGICAL_FILTER: { - // Passthrough supported. Handle expressions that filter NULLs auto &filter = op.Cast(); - filter.SplitPredicates(filter.expressions); - for (const auto &expr : filter.expressions) { - if (expr->GetExpressionClass() == ExpressionClass::BOUND_OPERATOR && - expr->GetExpressionType() == ExpressionType::OPERATOR_IS_NOT_NULL) { - const auto &is_not_null = expr->Cast(); - HandleExpression(*is_not_null.GetChildren()[0]); - } else if (BoundComparisonExpression::IsComparison(*expr)) { - if (expr->GetExpressionType() == ExpressionType::COMPARE_DISTINCT_FROM || - expr->GetExpressionType() == ExpressionType::COMPARE_NOT_DISTINCT_FROM) { - continue; - } - const auto &comparison = expr->Cast(); - HandleExpression(BoundComparisonExpression::Left(comparison)); - HandleExpression(BoundComparisonExpression::Right(comparison)); - } - } + VisitFilter(filter, op); + return; + } + case LogicalOperatorType::LOGICAL_ORDER_BY: { + auto &order = op.Cast(); + VisitOrder(order, op); + return; + } + case LogicalOperatorType::LOGICAL_TOP_N: { + auto &top_n = op.Cast(); + VisitTopN(top_n, op); + return; + } + case LogicalOperatorType::LOGICAL_LIMIT: VisitOperatorChildren(op); return; + case LogicalOperatorType::LOGICAL_AGGREGATE_AND_GROUP_BY: { + auto &aggregate = op.Cast(); + VisitAggregate(aggregate, op); + return; } default: // Passthrough not supported. 
TODO: could pass through more operators like LOGICAL_UNION break; } - // Recurse with a new optimizer for each child - for (auto &child : op.children) { - OuterJoinSimplification outer_join_simplification; - outer_join_simplification.VisitOperator(*child); - } + VisitUnsupportedOperator(op); } } // namespace duckdb diff --git a/src/duckdb/src/optimizer/pushdown/pushdown_get.cpp b/src/duckdb/src/optimizer/pushdown/pushdown_get.cpp index 8412dbbb0..0bdb67c19 100644 --- a/src/duckdb/src/optimizer/pushdown/pushdown_get.cpp +++ b/src/duckdb/src/optimizer/pushdown/pushdown_get.cpp @@ -4,15 +4,47 @@ #include "duckdb/planner/expression/bound_columnref_expression.hpp" #include "duckdb/planner/expression/bound_operator_expression.hpp" #include "duckdb/planner/expression/bound_parameter_expression.hpp" -#include "duckdb/planner/operator/logical_filter.hpp" +#include "duckdb/planner/expression_iterator.hpp" #include "duckdb/planner/operator/logical_get.hpp" #include "duckdb/planner/operator/logical_empty_result.hpp" namespace duckdb { + +/** + * When a BoundColumnRefExpression that's part of expr (a filter) arrives here, its + * name will be set to the projection name i.e. "other" for SELECT col as other. + * If CTE inlining optimizer collapses the CTE in + * WITH cte AS (SELECT col AS other FROM reader()) SELECT * WHERE other > 0 FROM cte, + * reader() will get a complex filter with "other" which doesn't exist. 
+ * Rename the columns back to their original names + */ +static void NormalizeColumnRefAliases(unique_ptr &expr, const LogicalGet &get) { + const vector &column_ids = get.GetColumnIds(); + ExpressionIterator::VisitExpressionMutable(expr, [&](auto &ref, auto &) { + const ColumnBinding &binding = ref.Binding(); + if (binding.table_index != get.table_index || binding.column_index >= column_ids.size()) { + return; + } + const ColumnIndex &col_idx = column_ids[binding.column_index]; + const idx_t primary = col_idx.GetPrimaryIndex(); + if (col_idx.IsVirtualColumn()) { + if (const auto it = get.virtual_columns.find(primary); it != get.virtual_columns.end()) { + ref.SetAlias(Identifier(it->second.name.GetIdentifierName())); + } + } else if (primary < get.names.size()) { + ref.SetAlias(Identifier(col_idx.GetName(get.names[primary].GetIdentifierName()))); + } + }); +} + unique_ptr FilterPushdown::PushdownGet(unique_ptr op) { D_ASSERT(op->type == LogicalOperatorType::LOGICAL_GET); auto &get = op->Cast(); + for (auto &filter : filters) { + NormalizeColumnRefAliases(filter->filter, get); + } + if (get.function.pushdown_complex_filter || get.function.filter_pushdown) { // this scan supports some form of filter push-down // check if there are any parameters diff --git a/src/duckdb/src/optimizer/rule/contains_to_in_clause.cpp b/src/duckdb/src/optimizer/rule/contains_to_in_clause.cpp index 0693edc0b..1756b6dee 100644 --- a/src/duckdb/src/optimizer/rule/contains_to_in_clause.cpp +++ b/src/duckdb/src/optimizer/rule/contains_to_in_clause.cpp @@ -9,7 +9,8 @@ namespace duckdb { ContainsToInClauseRule::ContainsToInClauseRule(ExpressionRewriter &rewriter) : Rule(rewriter) { auto func = make_uniq(); - func->function = make_uniq("contains"); + identifier_set_t functions = {"contains", "list_contains", "list_has", "array_contains", "array_has"}; + func->function = make_uniq(functions); func->matchers.push_back(make_uniq()); func->matchers.push_back(make_uniq()); func->policy = 
SetMatcher::Policy::ORDERED; @@ -22,23 +23,45 @@ unique_ptr ContainsToInClauseRule::Apply(LogicalOperator &op, vector auto &list_arg = expr.GetChildren()[0]; auto &probe_arg = expr.GetChildren()[1]; - if (probe_arg->IsFoldable()) { + Value list_val; + if (!ExpressionExecutor::TryEvaluateScalar(GetContext(), *list_arg, list_val)) { return nullptr; } - Value list_val; - if (!ExpressionExecutor::TryEvaluateScalar(GetContext(), *list_arg, list_val)) { + // Null list: result is always NULL regardless of the probe value. + if (list_val.IsNull()) { + changes_made = true; + return make_uniq(Value(LogicalType::BOOLEAN)); + } + + // For other types (i.e., string/map/struct) leave them alone. + if (list_val.type().id() != LogicalTypeId::LIST) { return nullptr; } - if (list_val.IsNull() || list_val.type().id() != LogicalTypeId::LIST || ListValue::GetChildren(list_val).empty()) { + + // Collect non-NULL elements from the list. + const auto &child_type = ListType::GetChildType(list_val.type()); + vector non_null_elements; + for (const auto &elem : ListValue::GetChildren(list_val)) { + if (!elem.IsNull()) { + non_null_elements.emplace_back(elem.DefaultCastAs(child_type)); + } + } + + // No non-NULL elements: never contains any value. + if (non_null_elements.empty()) { + changes_made = true; + return ExpressionRewriter::ConstantOrNull(probe_arg->Copy(), Value::BOOLEAN(false)); + } + + // Fully constant probe: let constant folding handle it. 
+ if (probe_arg->IsFoldable()) { return nullptr; } auto in_expr = make_uniq(ExpressionType::COMPARE_IN, LogicalType::BOOLEAN); in_expr->GetChildrenMutable().push_back(probe_arg->Copy()); - const auto &child_type = ListType::GetChildType(list_val.type()); - for (const auto &elem : ListValue::GetChildren(list_val)) { - Value v = elem.DefaultCastAs(child_type); + for (auto &v : non_null_elements) { in_expr->GetChildrenMutable().push_back(make_uniq(std::move(v))); } changes_made = true; diff --git a/src/duckdb/src/optimizer/statistics/expression/propagate_cast.cpp b/src/duckdb/src/optimizer/statistics/expression/propagate_cast.cpp index c1a9bcab8..4e14e3103 100644 --- a/src/duckdb/src/optimizer/statistics/expression/propagate_cast.cpp +++ b/src/duckdb/src/optimizer/statistics/expression/propagate_cast.cpp @@ -228,6 +228,11 @@ unique_ptr StatisticsPropagator::TryPropagateCast(const BaseStat // the cast shreds every value into a single bucket - mirror the (possibly nested) source as typed stats return VariantStats::StatisticsPropagateToVariant(source, stats); } + if (source.id() == LogicalTypeId::GEOMETRY && target.id() == LogicalTypeId::GEOMETRY) { + // A geometry -> geometry cast only changes CRS metadata, not coordinates, so the bounding box, + // type set and null-ness are unchanged: propagate the statistics as-is. 
+ return stats.Copy().ToUnique(); + } if (!CanPropagateCast(source, target)) { return nullptr; } diff --git a/src/duckdb/src/optimizer/type_pushdown.cpp b/src/duckdb/src/optimizer/type_pushdown.cpp index 6eda83067..e68a8c383 100644 --- a/src/duckdb/src/optimizer/type_pushdown.cpp +++ b/src/duckdb/src/optimizer/type_pushdown.cpp @@ -202,10 +202,11 @@ unique_ptr CastCollect::VisitReplace(BoundCastExpression &expr, uniq if (top_level_casts.count(&expr)) { col_to_cast.emplace(binding->column_index, &expr); } - } else if (it->second == nullptr || it->second->GetReturnType() != expr.GetReturnType()) { + } else if (it->second == nullptr || it->second->GetReturnType() != expr.GetReturnType() || + it->second->Cast().IsTryCast() != expr.IsTryCast()) { // Different target type, or already a conflict. - // TODO(myrrc): CAST and TRY_CAST to the same type don't conflict - // but what if reader can push down TRY_CAST but not CAST? + // If reader can push CAST but not TRY_CAST or vice versa, we can't + // pretend they are the same it->second = nullptr; } diff --git a/src/duckdb/src/parser/parser.cpp b/src/duckdb/src/parser/parser.cpp index d55d7646e..92d767616 100644 --- a/src/duckdb/src/parser/parser.cpp +++ b/src/duckdb/src/parser/parser.cpp @@ -58,6 +58,8 @@ static bool IsValidDollarQuotedStringTagSubsequentChar(const unsigned char &c) { return IsValidDollarQuotedStringTagFirstChar(c) || (c >= '0' && c <= '9'); } +//! Throw a ParserException if `query` contains invalid UTF-8, so the tokenizer never reads past +//! bad bytes (a bad-byte query can otherwise recurse the tokenizer — see ossfuzz clusterfuzz-test-24).
static void ValidateUTF8Query(const string &query) { UnicodeInvalidReason reason = UnicodeInvalidReason::INVALID_UNICODE; size_t invalid_pos = 0; @@ -220,15 +222,18 @@ void Parser::ThrowParserOverrideError(ParserOverrideResult &result) { } } -void Parser::ParseQuery(const string &query) { +string Parser::NormalizeSQLString(const string &query) { + // Validate before strip: StripUnicodeSpaces walks multi-byte sequences and assumes valid UTF-8. ValidateUTF8Query(query); - { - string new_query; - if (StripUnicodeSpaces(query, new_query)) { - ParseQuery(new_query); - return; - } + string normalized; + if (StripUnicodeSpaces(query, normalized)) { + return normalized; } + return query; +} + +void Parser::ParseQuery(const string &query_p) { + const string query = NormalizeSQLString(query_p); if (options.extensions) { bool has_strict_extension_error = false; ErrorData last_strict_extension_error; @@ -272,53 +277,11 @@ void Parser::ParseQuery(const string &query) { statements.push_back(std::move(stmt)); } } catch (ParserException &e) { - bool parsed = false; - if (options.extensions && options.extensions->HasParserExtensions()) { - idx_t failure_byte = token_cursor < tokens.size() ? tokens[token_cursor].offset : query.size(); - // SimpleToken view of the tail: text + classified type, in source order, so - // extensions can dispatch on the token stream without re-tokenizing. The extension - // reports how many of these tokens it consumed. - vector simple_tokens; - simple_tokens.reserve(tokens.size() - token_cursor); - for (idx_t i = token_cursor; i < tokens.size(); i++) { - simple_tokens.emplace_back(tokens[i].text, tokens[i].type); - } - for (auto &ext : options.extensions->ParserExtensions()) { - if (!ext.parse_function) { - continue; - } - auto result = ext.parse_function(ext.parser_info.get(), simple_tokens); - if (result.consumed_tokens < 0) { - // The extension wants to surface an error. 
- throw ParserException::SyntaxError(query, result.error, result.error_location); - } - if (result.consumed_tokens == 0) { - // The extension ran but did not claim this input — let the next one try. - continue; - } - // consumed_tokens > 0: the extension accepted that many leading tokens. - auto consumed = NumericCast(result.consumed_tokens); - if (consumed > simple_tokens.size()) { - throw ParserException( - "Extension returned consumed_tokens=%llu — only %llu tokens are available", - (uint64_t)consumed, (uint64_t)simple_tokens.size()); - } - // The claimed span runs from the failure point to the end of the last consumed - // token; advancing the cursor by consumed_tokens lands on a token boundary. - auto &last_token = tokens[token_cursor + consumed - 1]; - const idx_t span_end_byte = last_token.offset + last_token.length; - auto estmt = make_uniq(ext, std::move(result.parse_data)); - estmt->stmt_location = failure_byte; - estmt->stmt_length = span_end_byte - failure_byte; - statements.push_back(std::move(estmt)); - token_cursor += consumed; - parsed = true; - break; - } - } - if (!parsed) { + auto ext_stmt = TryParseExtensionStatement(tokens, token_cursor, query); + if (!ext_stmt) { throw; } + statements.push_back(std::move(ext_stmt)); } } @@ -339,6 +302,52 @@ void Parser::ParseQuery(const string &query) { } } +unique_ptr Parser::TryParseExtensionStatement(vector &tokens, idx_t &token_cursor, + const string &query) { + if (!options.extensions || !options.extensions->HasParserExtensions()) { + return nullptr; + } + idx_t failure_byte = token_cursor < tokens.size() ? tokens[token_cursor].offset : query.size(); + // SimpleToken view of the tail: text + classified type, in source order, so extensions can + // dispatch on the token stream without re-tokenizing. The extension reports how many of these + // tokens it consumed. 
+ vector simple_tokens; + simple_tokens.reserve(tokens.size() - token_cursor); + for (idx_t i = token_cursor; i < tokens.size(); i++) { + simple_tokens.emplace_back(tokens[i].text, tokens[i].type); + } + for (auto &ext : options.extensions->ParserExtensions()) { + if (!ext.parse_function) { + continue; + } + auto result = ext.parse_function(ext.parser_info.get(), simple_tokens); + if (result.consumed_tokens < 0) { + // The extension wants to surface an error. + throw ParserException::SyntaxError(query, result.error, result.error_location); + } + if (result.consumed_tokens == 0) { + // The extension ran but did not claim this input — let the next one try. + continue; + } + // consumed_tokens > 0: the extension accepted that many leading tokens. + auto consumed = NumericCast(result.consumed_tokens); + if (consumed > simple_tokens.size()) { + throw ParserException("Extension returned consumed_tokens=%llu — only %llu tokens are available", + (uint64_t)consumed, (uint64_t)simple_tokens.size()); + } + // The claimed span runs from the failure point to the end of the last consumed token; + // advancing the cursor by consumed_tokens lands on a token boundary. 
+ auto &last_token = tokens[token_cursor + consumed - 1]; + const idx_t span_end_byte = last_token.offset + last_token.length; + auto estmt = make_uniq(ext, std::move(result.parse_data)); + estmt->stmt_location = failure_byte; + estmt->stmt_length = span_end_byte - failure_byte; + token_cursor += consumed; + return std::move(estmt); + } + return nullptr; +} + unique_ptr Parser::ParseTopLevelStatement(vector &tokens, idx_t &token_cursor) { if (token_cursor >= tokens.size()) { return nullptr; diff --git a/src/duckdb/src/parser/peg/tokenizer/parser_tokenizer.cpp b/src/duckdb/src/parser/peg/tokenizer/parser_tokenizer.cpp index 1e9b7c472..bbfce4cee 100644 --- a/src/duckdb/src/parser/peg/tokenizer/parser_tokenizer.cpp +++ b/src/duckdb/src/parser/peg/tokenizer/parser_tokenizer.cpp @@ -3,9 +3,20 @@ namespace duckdb { +static bool IsEmptyQuotedIdentifier(const string &sql, idx_t start, idx_t end, TokenType type) { + return type == TokenType::IDENTIFIER && end == start + 2 && sql.substr(start, 2) == "\"\""; +} + ParserTokenizer::ParserTokenizer(const string &sql, vector &tokens) : BaseTokenizer(sql, tokens) { } +void ParserTokenizer::PushToken(idx_t start, idx_t end, TokenType type, bool unterminated) { + if (IsEmptyQuotedIdentifier(sql, start, end, type)) { + throw ParserException::SyntaxError(sql, "zero-length delimited identifier", optional_idx(start)); + } + BaseTokenizer::PushToken(start, end, type, unterminated); +} + void ParserTokenizer::OnStatementEnd(idx_t pos) { // Always emit ';' as a TERMINATOR token so the grammar can consume it. // Statement boundaries are determined by the PEG grammar (Program rule), not the tokenizer. 
diff --git a/src/duckdb/src/parser/peg/transformer/transform_generated.cpp b/src/duckdb/src/parser/peg/transformer/transform_generated.cpp index 08084d1c0..95e44986d 100644 --- a/src/duckdb/src/parser/peg/transformer/transform_generated.cpp +++ b/src/duckdb/src/parser/peg/transformer/transform_generated.cpp @@ -7966,6 +7966,23 @@ unique_ptr PEGTransformerFactory::TransformOffsetLimitClau return make_uniq>>(std::move(result)); } +unique_ptr PEGTransformerFactory::TransformOffsetFetchClauseInternal(PEGTransformer &transformer, + ParseResult &parse_result) { + auto &list_pr = parse_result.Cast(); + auto offset_clause = transformer.Transform(list_pr.GetChild(0)); + auto fetch_clause = transformer.Transform(list_pr.GetChild(1)); + auto result = TransformOffsetFetchClause(transformer, std::move(offset_clause), std::move(fetch_clause)); + return make_uniq>>(std::move(result)); +} + +unique_ptr PEGTransformerFactory::TransformFetchOnlyClauseInternal(PEGTransformer &transformer, + ParseResult &parse_result) { + auto &list_pr = parse_result.Cast(); + auto fetch_clause = transformer.Transform(list_pr.GetChild(0)); + auto result = TransformFetchOnlyClause(transformer, std::move(fetch_clause)); + return make_uniq>>(std::move(result)); +} + unique_ptr PEGTransformerFactory::TransformTableStatementInternal(PEGTransformer &transformer, ParseResult &parse_result) { auto &list_pr = parse_result.Cast(); @@ -9353,6 +9370,22 @@ unique_ptr PEGTransformerFactory::TransformLimitExpression return make_uniq>(std::move(result)); } +unique_ptr PEGTransformerFactory::TransformFetchClauseInternal(PEGTransformer &transformer, + ParseResult &parse_result) { + auto &list_pr = parse_result.Cast(); + auto fetch_value = transformer.Transform(list_pr.GetChild(2)); + auto result = std::move(fetch_value); + return make_uniq>(std::move(result)); +} + +unique_ptr PEGTransformerFactory::TransformFetchValueInternal(PEGTransformer &transformer, + ParseResult &parse_result) { + auto &list_pr = 
parse_result.Cast(); + auto expression = transformer.Transform>(list_pr.GetChild(0)); + auto result = TransformFetchValue(transformer, std::move(expression)); + return make_uniq>(std::move(result)); +} + unique_ptr PEGTransformerFactory::TransformAliasedExpressionInternal(PEGTransformer &transformer, ParseResult &parse_result) { auto &list_pr = parse_result.Cast(); @@ -10769,6 +10802,8 @@ void PEGTransformerFactory::RegisterGenerated() { {"LimitOffset", &PEGTransformerFactory::TransformLimitOffsetInternal}, {"LimitOffsetClause", &PEGTransformerFactory::TransformLimitOffsetClauseInternal}, {"OffsetLimitClause", &PEGTransformerFactory::TransformOffsetLimitClauseInternal}, + {"OffsetFetchClause", &PEGTransformerFactory::TransformOffsetFetchClauseInternal}, + {"FetchOnlyClause", &PEGTransformerFactory::TransformFetchOnlyClauseInternal}, {"TableStatement", &PEGTransformerFactory::TransformTableStatementInternal}, {"OptionalParensSimpleSelect", &PEGTransformerFactory::TransformOptionalParensSimpleSelectInternal}, {"SimpleSelectParens", &PEGTransformerFactory::TransformSimpleSelectParensInternal}, @@ -10897,6 +10932,8 @@ void PEGTransformerFactory::RegisterGenerated() { {"LimitAll", &PEGTransformerFactory::TransformLimitAllInternal}, {"LimitLiteralPercent", &PEGTransformerFactory::TransformLimitLiteralPercentInternal}, {"LimitExpression", &PEGTransformerFactory::TransformLimitExpressionInternal}, + {"FetchClause", &PEGTransformerFactory::TransformFetchClauseInternal}, + {"FetchValue", &PEGTransformerFactory::TransformFetchValueInternal}, {"AliasedExpression", &PEGTransformerFactory::TransformAliasedExpressionInternal}, {"ColIdExpression", &PEGTransformerFactory::TransformColIdExpressionInternal}, {"ExpressionAsCollabel", &PEGTransformerFactory::TransformExpressionAsCollabelInternal}, diff --git a/src/duckdb/src/parser/peg/transformer/transform_select.cpp b/src/duckdb/src/parser/peg/transformer/transform_select.cpp index beea0ff65..ead412c0c 100644 --- 
a/src/duckdb/src/parser/peg/transformer/transform_select.cpp +++ b/src/duckdb/src/parser/peg/transformer/transform_select.cpp @@ -1013,6 +1013,18 @@ PEGTransformerFactory::TransformOffsetLimitClause(PEGTransformer &transformer, L return VerifyLimitOffset(limit_clause ? *limit_clause : empty_limit, offset_clause); } +unique_ptr PEGTransformerFactory::TransformOffsetFetchClause(PEGTransformer &transformer, + LimitPercentResult offset_clause, + LimitPercentResult fetch_clause) { + return VerifyLimitOffset(fetch_clause, offset_clause); +} + +unique_ptr PEGTransformerFactory::TransformFetchOnlyClause(PEGTransformer &transformer, + LimitPercentResult fetch_clause) { + LimitPercentResult empty_offset; + return VerifyLimitOffset(fetch_clause, empty_offset); +} + unique_ptr PEGTransformerFactory::TransformTableStatement(PEGTransformer &transformer, unique_ptr base_table_name) { auto result = make_uniq(); @@ -1539,6 +1551,13 @@ LimitPercentResult PEGTransformerFactory::TransformLimitLiteralPercent(PEGTransf return result; } +LimitPercentResult PEGTransformerFactory::TransformFetchValue(PEGTransformer &transformer, + unique_ptr expression) { + LimitPercentResult result; + result.expression = std::move(expression); + return result; +} + unique_ptr PEGTransformerFactory::TransformColIdExpression(PEGTransformer &transformer, const Identifier &col_id, unique_ptr expression) { diff --git a/src/duckdb/src/planner/binder/expression/bind_unnest_expression.cpp b/src/duckdb/src/planner/binder/expression/bind_unnest_expression.cpp index 3962f0283..b98961801 100644 --- a/src/duckdb/src/planner/binder/expression/bind_unnest_expression.cpp +++ b/src/duckdb/src/planner/binder/expression/bind_unnest_expression.cpp @@ -69,7 +69,7 @@ BindResult SelectBinder::BindUnnest(FunctionExpression &function, idx_t depth, b ErrorData error; if (function.GetArguments().empty()) { - return BindResult(BinderException(function, "UNNEST() requires at lease one argument")); + return 
BindResult(BinderException(function, "UNNEST() requires at least one argument")); } if (inside_window || inside_aggregate || inside_try) { return BindResult(BinderException(function, UnsupportedUnnestMessage())); diff --git a/src/duckdb/src/planner/filter/expression_filter.cpp b/src/duckdb/src/planner/filter/expression_filter.cpp index e6a89658e..aaeb8eb86 100644 --- a/src/duckdb/src/planner/filter/expression_filter.cpp +++ b/src/duckdb/src/planner/filter/expression_filter.cpp @@ -164,18 +164,10 @@ bool ExpressionFilter::EvaluateWithConstant(ExpressionExecutor &executor, const } FilterPropagateResult ExpressionFilter::CheckStatistics(const BaseStatistics &stats) const { - if (stats.GetStatsType() == StatisticsType::GEOMETRY_STATS) { - // Delegate to GeometryStats for geometry types - return GeometryStats::CheckZonemap(stats, expr); - } return CheckExpressionStatistics(*expr, stats); } FilterPropagateResult ExpressionFilter::CheckStatistics(ClientContext &context, const BaseStatistics &stats) const { - if (stats.GetStatsType() == StatisticsType::GEOMETRY_STATS) { - // Delegate to GeometryStats for geometry types - return GeometryStats::CheckZonemap(stats, expr); - } return CheckExpressionStatistics(&context, *expr, stats); } @@ -395,16 +387,16 @@ static FilterPropagateResult CheckFunctionStatistics(optional_ptr if (!func_expr.Function().HasFilterPruneCallback()) { return FilterPropagateResult::NO_PRUNING_POSSIBLE; } + // Derive the statistics of each argument. This lets a callback prune regardless of which argument is the column and + // which is the constant (e.g. `foo(col, const)` vs `foo(const, col)`). + // Entries are null when an argument's stats can't be derived. 
vector> owned_stats; - auto filter_stats = &stats; - if (!func_expr.GetChildren().empty()) { - auto child_stats = TryGetFilterStats(context_p, *func_expr.GetChildren()[0], stats, owned_stats); - if (!child_stats) { - return FilterPropagateResult::NO_PRUNING_POSSIBLE; - } - filter_stats = child_stats.get(); + vector> child_stats; + child_stats.reserve(func_expr.GetChildren().size()); + for (auto &child : func_expr.GetChildren()) { + child_stats.push_back(TryGetFilterStats(context_p, *child, stats, owned_stats)); } - FunctionStatisticsPruneInput input(func_expr.BindInfo().get(), *filter_stats); + FunctionStatisticsPruneInput input(func_expr, func_expr.BindInfo().get(), child_stats); return func_expr.Function().GetFilterPruneCallback()(input); } diff --git a/src/duckdb/src/planner/filter/table_filter_bloom_function.cpp b/src/duckdb/src/planner/filter/table_filter_bloom_function.cpp index b46827965..6942ad483 100644 --- a/src/duckdb/src/planner/filter/table_filter_bloom_function.cpp +++ b/src/duckdb/src/planner/filter/table_filter_bloom_function.cpp @@ -260,28 +260,33 @@ FilterPropagateResult BloomFilterScalarFun::FilterPrune(const FunctionStatistics if (!data.filter || !data.filter->IsInitialized()) { return FilterPropagateResult::NO_PRUNING_POSSIBLE; } + auto column_stats = input.ChildStats(0); + if (!column_stats) { + return FilterPropagateResult::NO_PRUNING_POSSIBLE; + } + auto &stats = *column_stats; switch (data.key_type.InternalType()) { case PhysicalType::UINT8: - return TemplatedBloomFilterPrune(*data.filter, input.stats); + return TemplatedBloomFilterPrune(*data.filter, stats); case PhysicalType::UINT16: - return TemplatedBloomFilterPrune(*data.filter, input.stats); + return TemplatedBloomFilterPrune(*data.filter, stats); case PhysicalType::UINT32: - return TemplatedBloomFilterPrune(*data.filter, input.stats); + return TemplatedBloomFilterPrune(*data.filter, stats); case PhysicalType::UINT64: - return TemplatedBloomFilterPrune(*data.filter, input.stats); + 
return TemplatedBloomFilterPrune(*data.filter, stats); case PhysicalType::UINT128: - return TemplatedBloomFilterPrune(*data.filter, input.stats); + return TemplatedBloomFilterPrune(*data.filter, stats); case PhysicalType::INT8: - return TemplatedBloomFilterPrune(*data.filter, input.stats); + return TemplatedBloomFilterPrune(*data.filter, stats); case PhysicalType::INT16: - return TemplatedBloomFilterPrune(*data.filter, input.stats); + return TemplatedBloomFilterPrune(*data.filter, stats); case PhysicalType::INT32: - return TemplatedBloomFilterPrune(*data.filter, input.stats); + return TemplatedBloomFilterPrune(*data.filter, stats); case PhysicalType::INT64: - return TemplatedBloomFilterPrune(*data.filter, input.stats); + return TemplatedBloomFilterPrune(*data.filter, stats); case PhysicalType::INT128: - return TemplatedBloomFilterPrune(*data.filter, input.stats); + return TemplatedBloomFilterPrune(*data.filter, stats); default: return FilterPropagateResult::NO_PRUNING_POSSIBLE; } diff --git a/src/duckdb/src/planner/filter/table_filter_dynamic_function.cpp b/src/duckdb/src/planner/filter/table_filter_dynamic_function.cpp index 8441d22c4..f6d68b1b3 100644 --- a/src/duckdb/src/planner/filter/table_filter_dynamic_function.cpp +++ b/src/duckdb/src/planner/filter/table_filter_dynamic_function.cpp @@ -146,7 +146,11 @@ FilterPropagateResult DynamicFilterScalarFun::FilterPrune(const FunctionStatisti if (!data.filter_data->initialized) { return FilterPropagateResult::NO_PRUNING_POSSIBLE; } - return DynamicFilterData::CheckStatistics(input.stats, data.filter_data->comparison_type, + auto column_stats = input.ChildStats(0); + if (!column_stats) { + return FilterPropagateResult::NO_PRUNING_POSSIBLE; + } + return DynamicFilterData::CheckStatistics(*column_stats, data.filter_data->comparison_type, data.filter_data->constant); } diff --git a/src/duckdb/src/planner/filter/table_filter_optional_function.cpp b/src/duckdb/src/planner/filter/table_filter_optional_function.cpp index 
2769f412a..67437c323 100644 --- a/src/duckdb/src/planner/filter/table_filter_optional_function.cpp +++ b/src/duckdb/src/planner/filter/table_filter_optional_function.cpp @@ -73,7 +73,11 @@ FilterPropagateResult OptionalFilterScalarFun::FilterPrune(const FunctionStatist if (!data.child_filter_expr) { return FilterPropagateResult::NO_PRUNING_POSSIBLE; } - return ExpressionFilter::CheckExpressionStatistics(*data.child_filter_expr, input.stats); + auto column_stats = input.ChildStats(0); + if (!column_stats) { + return FilterPropagateResult::NO_PRUNING_POSSIBLE; + } + return ExpressionFilter::CheckExpressionStatistics(*data.child_filter_expr, *column_stats); } string OptionalFilterScalarFun::ToString(const string &child_filter_string) { diff --git a/src/duckdb/src/planner/filter/table_filter_prefix_range_function.cpp b/src/duckdb/src/planner/filter/table_filter_prefix_range_function.cpp index 488100e7e..d7c5f4995 100644 --- a/src/duckdb/src/planner/filter/table_filter_prefix_range_function.cpp +++ b/src/duckdb/src/planner/filter/table_filter_prefix_range_function.cpp @@ -547,26 +547,31 @@ FilterPropagateResult PrefixRangeScalarFun::FilterPrune(const FunctionStatistics if (!data.filter || !data.filter->IsInitialized()) { return FilterPropagateResult::NO_PRUNING_POSSIBLE; } - switch (input.stats.GetStatsType()) { + auto column_stats = input.ChildStats(0); + if (!column_stats) { + return FilterPropagateResult::NO_PRUNING_POSSIBLE; + } + auto &stats = *column_stats; + switch (stats.GetStatsType()) { case StatisticsType::NUMERIC_STATS: { - if (!NumericStats::HasMinMax(input.stats)) { + if (!NumericStats::HasMinMax(stats)) { return FilterPropagateResult::NO_PRUNING_POSSIBLE; } - const auto min = NumericStats::Min(input.stats); - const auto max = NumericStats::Max(input.stats); + const auto min = NumericStats::Min(stats); + const auto max = NumericStats::Max(stats); if (min > max) { return FilterPropagateResult::NO_PRUNING_POSSIBLE; } return data.filter->LookupRange(min, 
max); } case StatisticsType::STRING_STATS: { - if (!StringStats::HasMinMax(input.stats)) { + if (!StringStats::HasMinMax(stats)) { return FilterPropagateResult::NO_PRUNING_POSSIBLE; } // String stats may contain raw parquet bytes that are not valid UTF-8. Reconstruct them as BLOBs so the // prefix-range comparable logic can inspect the raw bytes without value-construction validation. - return data.filter->LookupRange(Value::BLOB_RAW(StringStats::Min(input.stats)), - Value::BLOB_RAW(StringStats::Max(input.stats))); + return data.filter->LookupRange(Value::BLOB_RAW(StringStats::Min(stats)), + Value::BLOB_RAW(StringStats::Max(stats))); } default: return FilterPropagateResult::NO_PRUNING_POSSIBLE; diff --git a/src/duckdb/src/planner/filter/table_filter_selectivity_optional_function.cpp b/src/duckdb/src/planner/filter/table_filter_selectivity_optional_function.cpp index 2dfb35f76..38f6327e2 100644 --- a/src/duckdb/src/planner/filter/table_filter_selectivity_optional_function.cpp +++ b/src/duckdb/src/planner/filter/table_filter_selectivity_optional_function.cpp @@ -172,7 +172,11 @@ FilterPropagateResult SelectivityOptionalFilterScalarFun::FilterPrune(const Func if (!data.child_filter_expr) { return FilterPropagateResult::NO_PRUNING_POSSIBLE; } - return ExpressionFilter::CheckExpressionStatistics(*data.child_filter_expr, input.stats); + auto column_stats = input.ChildStats(0); + if (!column_stats) { + return FilterPropagateResult::NO_PRUNING_POSSIBLE; + } + return ExpressionFilter::CheckExpressionStatistics(*data.child_filter_expr, *column_stats); } string SelectivityOptionalFilterScalarFun::ToString(const string &child_filter_string) { diff --git a/src/duckdb/src/storage/statistics/geometry_stats.cpp b/src/duckdb/src/storage/statistics/geometry_stats.cpp index 783f92520..7949c2265 100644 --- a/src/duckdb/src/storage/statistics/geometry_stats.cpp +++ b/src/duckdb/src/storage/statistics/geometry_stats.cpp @@ -4,9 +4,6 @@ #include "duckdb/common/types/vector.hpp" #include 
"duckdb/common/serializer/serializer.hpp" #include "duckdb/common/serializer/deserializer.hpp" -#include "duckdb/planner/expression/bound_cast_expression.hpp" -#include "duckdb/planner/expression/bound_constant_expression.hpp" -#include "duckdb/planner/expression/bound_function_expression.hpp" namespace duckdb { @@ -234,114 +231,4 @@ const GeometryStatsFlags &GeometryStats::GetFlags(const BaseStatistics &stats) { return GetDataUnsafe(stats).flags; } -// Expression comparison pruning -static FilterPropagateResult CheckIntersectionFilter(const GeometryStatsData &data, const Value &constant) { - if (constant.IsNull() || constant.type().id() != LogicalTypeId::GEOMETRY) { - // Cannot prune against NULL - return FilterPropagateResult::NO_PRUNING_POSSIBLE; - } - - // This has been checked before and needs to be true for the checks below to be valid. - // Note: only one axis needs to be set; an unknown axis is an infinite range that - // intersects everything, so the IntersectsXY/ContainsXY math below stays valid. 
- D_ASSERT(data.extent.CanPruneXY()); - - const auto &geom = StringValue::Get(constant); - auto extent = GeometryExtent::Empty(); - if (Geometry::GetExtent(string_t(geom), extent) == 0) { - // If the geometry is empty, the predicate will never match - return FilterPropagateResult::FILTER_ALWAYS_FALSE; - } - - // Check if the bounding boxes intersect - // If the bounding boxes do not intersect, the predicate will never match - if (!extent.IntersectsXY(data.extent)) { - return FilterPropagateResult::FILTER_ALWAYS_FALSE; - } - - // If the column is completely inside the bounds, the predicate will always match - if (extent.ContainsXY(data.extent)) { - return FilterPropagateResult::FILTER_ALWAYS_TRUE; - } - - // We cannot prune, as this column may contain geometries that intersect - return FilterPropagateResult::NO_PRUNING_POSSIBLE; -} - -FilterPropagateResult GeometryStats::CheckZonemap(const BaseStatistics &stats, const unique_ptr &expr) { - if (expr->GetExpressionClass() != ExpressionClass::BOUND_FUNCTION) { - return FilterPropagateResult::NO_PRUNING_POSSIBLE; - } - if (expr->GetReturnType() != LogicalType::BOOLEAN) { - return FilterPropagateResult::NO_PRUNING_POSSIBLE; - } - const auto &func = expr->Cast(); - if (func.GetChildren().size() != 2) { - return FilterPropagateResult::NO_PRUNING_POSSIBLE; - } - - if (func.GetChildren()[0]->GetReturnType().id() != LogicalTypeId::GEOMETRY || - func.GetChildren()[1]->GetReturnType().id() != LogicalTypeId::GEOMETRY) { - return FilterPropagateResult::NO_PRUNING_POSSIBLE; - } - - // The set of geometry predicates that can be optimized using the bounding box - static constexpr const char *geometry_predicates[2] = {"&&", "st_intersects_extent"}; - - auto found = false; - for (const auto &name : geometry_predicates) { - if (func.Function().GetName() == name) { - found = true; - break; - } - } - if (!found) { - // Not a geometry predicate we can optimize - return FilterPropagateResult::NO_PRUNING_POSSIBLE; - } - - // The column 
reference may be wrapped in a GEOMETRY -> GEOMETRY cast (e.g. a CRS-erasing cast inserted to match - // the predicate's argument type). Such casts only change CRS metadata, not coordinates, so the bounding box - // remains valid. Look through them when classifying the operands. - auto strip_geometry_cast = [](const Expression &child) -> const Expression * { - if (child.GetExpressionType() == ExpressionType::OPERATOR_CAST) { - auto &cast = child.Cast(); - if (cast.Child().GetReturnType().id() == LogicalTypeId::GEOMETRY) { - return &cast.Child(); - } - } - return &child; - }; - - const auto &lhs = *strip_geometry_cast(*func.GetChildren()[0]); - const auto &rhs = *strip_geometry_cast(*func.GetChildren()[1]); - const auto lhs_kind = lhs.GetExpressionType(); - const auto rhs_kind = rhs.GetExpressionType(); - const auto lhs_is_const = lhs_kind == ExpressionType::VALUE_CONSTANT && rhs_kind == ExpressionType::BOUND_REF; - const auto rhs_is_const = rhs_kind == ExpressionType::VALUE_CONSTANT && lhs_kind == ExpressionType::BOUND_REF; - - if (!stats.CanHaveNoNull()) { - // no non-null values are possible: always false - return FilterPropagateResult::FILTER_ALWAYS_FALSE; - } - - auto &data = GetDataUnsafe(stats); - - if (!data.extent.CanPruneXY()) { - // If neither axis is set (the extent is empty or fully unknown), we cannot prune. - // A single known axis is enough: the unknown axis is an infinite range that - // intersects everything, so pruning degrades to the known axis. 
- return FilterPropagateResult::NO_PRUNING_POSSIBLE; - } - - if (lhs_is_const) { - return CheckIntersectionFilter(data, lhs.Cast().GetValue()); - } - if (rhs_is_const) { - return CheckIntersectionFilter(data, rhs.Cast().GetValue()); - } - // Else, no constant argument - return FilterPropagateResult::NO_PRUNING_POSSIBLE; -} - } // namespace duckdb diff --git a/src/duckdb/ub_src_main.cpp b/src/duckdb/ub_src_main.cpp index 8df1fa6ca..df2bd02e5 100644 --- a/src/duckdb/ub_src_main.cpp +++ b/src/duckdb/ub_src_main.cpp @@ -44,6 +44,8 @@ #include "src/main/materialized_query_result.cpp" +#include "src/main/parse_iterator.cpp" + #include "src/main/pending_query_result.cpp" #include "src/main/prepared_statement.cpp" @@ -58,6 +60,8 @@ #include "src/main/result_set_manager.cpp" +#include "src/main/statement_iterator.cpp" + #include "src/main/stream_query_result.cpp" #include "src/main/user_settings.cpp"