Prune files for notEq / notIn when the column holds a single value.

Before this patch the NotEq and NotIn bodies shown in the hunks below consisted of
a comment followed by `return kRowsMightMatch;`. The patch adds a UniqueValue()
helper that yields the column's value when the file metrics show no nulls, no NaNs,
and lower bound == upper bound. NotEq returns kRowCannotMatch when that value equals
the literal; NotIn returns kRowCannotMatch when that value is in the exclusion set.
The new tests cover a range of values, a single value, files with nulls, files with
NaNs, and NaN bounds.

NOTE(review): the copy of this patch under review had all records fused onto a few
lines, template arguments missing (e.g. `Result NotEq(const std::shared_ptr& expr`),
and a stray `+` before `return kRowsMightMatch;` in the first two hunks (8 added
records where the hunk headers allow 7). Line breaks, indentation, and template
arguments below were reinstated from context -- confirm against the repository
before applying, or apply with whitespace-tolerant options.

diff --git a/src/iceberg/expression/inclusive_metrics_evaluator.cc b/src/iceberg/expression/inclusive_metrics_evaluator.cc
index 29f5aba24..79ec23665 100644
--- a/src/iceberg/expression/inclusive_metrics_evaluator.cc
+++ b/src/iceberg/expression/inclusive_metrics_evaluator.cc
@@ -223,6 +223,13 @@ class InclusiveMetricsVisitor : public BoundVisitor<bool> {
   Result<bool> NotEq(const std::shared_ptr<Bound>& expr, const Literal& lit) override {
     // because the bounds are not necessarily a min or max value, this cannot be answered
     // using them. notEq(col, X) with (X, Y) doesn't guarantee that X is a value in col.
+    // However, when min == max and the file has no nulls or NaN values, we can safely
+    // prune if that value equals the literal.
+    ICEBERG_ASSIGN_OR_RAISE(auto value, UniqueValue(expr));
+    if (value.has_value() && value.value() == lit) {
+      return kRowCannotMatch;
+    }
+
     return kRowsMightMatch;
   }
 
@@ -271,7 +278,13 @@ class InclusiveMetricsVisitor : public BoundVisitor<bool> {
                      const BoundSetPredicate::LiteralSet& literal_set) override {
     // because the bounds are not necessarily a min or max value, this cannot be answered
     // using them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a value in
-    // col.
+    // col. However, when min == max and the file has no nulls or NaN values, we can
+    // safely prune if that value is in the exclusion set.
+    ICEBERG_ASSIGN_OR_RAISE(auto value, UniqueValue(expr));
+    if (value.has_value() && literal_set.contains(value.value())) {
+      return kRowCannotMatch;
+    }
+
     return kRowsMightMatch;
   }
 
@@ -416,6 +429,34 @@ class InclusiveMetricsVisitor : public BoundVisitor<bool> {
     // TODO(xiao.dong) handle extract lower and upper bounds
   }
 
+  /// Returns the column's single value if all rows contain the same value. Defined as a
+  /// column with no nulls, no NaNs, and lower bound equals upper bound. Returns
+  /// std::nullopt otherwise.
+  Result<std::optional<Literal>> UniqueValue(const std::shared_ptr<Bound>& expr) {
+    int32_t id = expr->reference()->field().field_id();
+    if (MayContainNull(id)) {
+      return std::nullopt;
+    }
+
+    ICEBERG_ASSIGN_OR_RAISE(auto lower, LowerBound(expr));
+    ICEBERG_ASSIGN_OR_RAISE(auto upper, UpperBound(expr));
+    if (!lower.has_value() || !upper.has_value() || lower->IsNull() || upper->IsNull() ||
+        lower->IsNaN() || upper->IsNaN()) {
+      return std::nullopt;
+    }
+
+    auto nan_it = data_file_.nan_value_counts.find(id);
+    if (nan_it != data_file_.nan_value_counts.cend() && nan_it->second != 0) {
+      return std::nullopt;
+    }
+
+    if (lower.value() != upper.value()) {
+      return std::nullopt;
+    }
+
+    return lower;
+  }
+
   Result<std::optional<Literal>> ParseLowerBound(const BoundReference& ref) {
     int32_t id = ref.field().field_id();
     auto type = ref.type();
diff --git a/src/iceberg/test/inclusive_metrics_evaluator_test.cc b/src/iceberg/test/inclusive_metrics_evaluator_test.cc
index 27867f1a4..f4e24ffe0 100644
--- a/src/iceberg/test/inclusive_metrics_evaluator_test.cc
+++ b/src/iceberg/test/inclusive_metrics_evaluator_test.cc
@@ -945,4 +945,158 @@ TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerNotInTest) {
   RunTest(Expressions::NotIn("id", ids), kRowsMightMatch, file1_);
 }
 
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, NotEqWithSingleValue) {
+  // file has a range of values, cannot prune based on a single literal
+  auto range_of_values = std::make_shared<DataFile>();
+  range_of_values->file_path = "range_of_values";
+  range_of_values->file_format = FileFormatType::kParquet;
+  range_of_values->record_count = 10;
+  range_of_values->value_counts = {{3, 10L}};
+  range_of_values->null_value_counts = {{3, 0L}};
+  range_of_values->nan_value_counts = {{3, 0L}};
+  range_of_values->lower_bounds = {{3, Literal::String("aaa").Serialize().value()}};
+  range_of_values->upper_bounds = {{3, Literal::String("zzz").Serialize().value()}};
+  RunTest(Expressions::NotEqual("required", Literal::String("aaa")), kRowsMightMatch,
+          range_of_values);
+
+  // file contains a single value (lower == upper) with no nulls/NaNs
+  auto single_value = std::make_shared<DataFile>();
+  single_value->file_path = "single_value";
+  single_value->file_format = FileFormatType::kParquet;
+  single_value->record_count = 10;
+  single_value->value_counts = {{3, 10L}};
+  single_value->null_value_counts = {{3, 0L}};
+  single_value->nan_value_counts = {{3, 0L}};
+  single_value->lower_bounds = {{3, Literal::String("abc").Serialize().value()}};
+  single_value->upper_bounds = {{3, Literal::String("abc").Serialize().value()}};
+  // single value equals the literal -> rows cannot match
+  RunTest(Expressions::NotEqual("required", Literal::String("abc")), kRowCannotMatch,
+          single_value);
+  // single value differs from the literal -> rows might match
+  RunTest(Expressions::NotEqual("required", Literal::String("def")), kRowsMightMatch,
+          single_value);
+
+  // single value but the file has nulls, which satisfy the != predicate
+  auto single_value_with_nulls = std::make_shared<DataFile>();
+  single_value_with_nulls->file_path = "single_value_nulls";
+  single_value_with_nulls->file_format = FileFormatType::kParquet;
+  single_value_with_nulls->record_count = 10;
+  single_value_with_nulls->value_counts = {{3, 10L}};
+  single_value_with_nulls->null_value_counts = {{3, 2L}};
+  single_value_with_nulls->nan_value_counts = {{3, 0L}};
+  single_value_with_nulls->lower_bounds = {
+      {3, Literal::String("abc").Serialize().value()}};
+  single_value_with_nulls->upper_bounds = {
+      {3, Literal::String("abc").Serialize().value()}};
+  RunTest(Expressions::NotEqual("required", Literal::String("abc")), kRowsMightMatch,
+          single_value_with_nulls);
+
+  // single value but the file has NaNs, which satisfy the != predicate
+  auto single_value_with_nan = std::make_shared<DataFile>();
+  single_value_with_nan->file_path = "single_value_nan";
+  single_value_with_nan->file_format = FileFormatType::kParquet;
+  single_value_with_nan->record_count = 10;
+  single_value_with_nan->value_counts = {{9, 10L}};
+  single_value_with_nan->null_value_counts = {{9, 0L}};
+  single_value_with_nan->nan_value_counts = {{9, 2L}};
+  single_value_with_nan->lower_bounds = {{9, Literal::Float(5.0F).Serialize().value()}};
+  single_value_with_nan->upper_bounds = {{9, Literal::Float(5.0F).Serialize().value()}};
+  RunTest(Expressions::NotEqual("no_nans", Literal::Float(5.0F)), kRowsMightMatch,
+          single_value_with_nan);
+
+  // bounds are NaN -> unreliable, cannot prune
+  auto single_value_nan_bounds = std::make_shared<DataFile>();
+  single_value_nan_bounds->file_path = "single_value_nan_bounds";
+  single_value_nan_bounds->file_format = FileFormatType::kParquet;
+  single_value_nan_bounds->record_count = 10;
+  single_value_nan_bounds->value_counts = {{9, 10L}};
+  single_value_nan_bounds->null_value_counts = {{9, 0L}};
+  single_value_nan_bounds->nan_value_counts = {{9, 0L}};
+  single_value_nan_bounds->lower_bounds = {
+      {9, Literal::Float(kFloatNan).Serialize().value()}};
+  single_value_nan_bounds->upper_bounds = {
+      {9, Literal::Float(kFloatNan).Serialize().value()}};
+  RunTest(Expressions::NotEqual("no_nans", Literal::Float(5.0F)), kRowsMightMatch,
+          single_value_nan_bounds);
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, NotInWithSingleValue) {
+  // file has a range of values, cannot prune based on the exclusion set
+  auto range_of_values = std::make_shared<DataFile>();
+  range_of_values->file_path = "range_of_values";
+  range_of_values->file_format = FileFormatType::kParquet;
+  range_of_values->record_count = 10;
+  range_of_values->value_counts = {{3, 10L}};
+  range_of_values->null_value_counts = {{3, 0L}};
+  range_of_values->nan_value_counts = {{3, 0L}};
+  range_of_values->lower_bounds = {{3, Literal::String("aaa").Serialize().value()}};
+  range_of_values->upper_bounds = {{3, Literal::String("zzz").Serialize().value()}};
+  RunTest(
+      Expressions::NotIn("required", {Literal::String("aaa"), Literal::String("bbb")}),
+      kRowsMightMatch, range_of_values);
+
+  // file contains a single value (lower == upper) with no nulls/NaNs
+  auto single_value = std::make_shared<DataFile>();
+  single_value->file_path = "single_value";
+  single_value->file_format = FileFormatType::kParquet;
+  single_value->record_count = 10;
+  single_value->value_counts = {{3, 10L}};
+  single_value->null_value_counts = {{3, 0L}};
+  single_value->nan_value_counts = {{3, 0L}};
+  single_value->lower_bounds = {{3, Literal::String("abc").Serialize().value()}};
+  single_value->upper_bounds = {{3, Literal::String("abc").Serialize().value()}};
+  // single value is in the exclusion set -> rows cannot match
+  RunTest(
+      Expressions::NotIn("required", {Literal::String("abc"), Literal::String("def")}),
+      kRowCannotMatch, single_value);
+  // single value is not in the exclusion set -> rows might match
+  RunTest(
+      Expressions::NotIn("required", {Literal::String("def"), Literal::String("ghi")}),
+      kRowsMightMatch, single_value);
+
+  // single value but the file has nulls, which satisfy the notIn predicate
+  auto single_value_with_nulls = std::make_shared<DataFile>();
+  single_value_with_nulls->file_path = "single_value_nulls";
+  single_value_with_nulls->file_format = FileFormatType::kParquet;
+  single_value_with_nulls->record_count = 10;
+  single_value_with_nulls->value_counts = {{3, 10L}};
+  single_value_with_nulls->null_value_counts = {{3, 2L}};
+  single_value_with_nulls->nan_value_counts = {{3, 0L}};
+  single_value_with_nulls->lower_bounds = {
+      {3, Literal::String("abc").Serialize().value()}};
+  single_value_with_nulls->upper_bounds = {
+      {3, Literal::String("abc").Serialize().value()}};
+  RunTest(
+      Expressions::NotIn("required", {Literal::String("abc"), Literal::String("def")}),
+      kRowsMightMatch, single_value_with_nulls);
+
+  // single value but the file has NaNs, which satisfy the notIn predicate
+  auto single_value_with_nan = std::make_shared<DataFile>();
+  single_value_with_nan->file_path = "single_value_nan";
+  single_value_with_nan->file_format = FileFormatType::kParquet;
+  single_value_with_nan->record_count = 10;
+  single_value_with_nan->value_counts = {{9, 10L}};
+  single_value_with_nan->null_value_counts = {{9, 0L}};
+  single_value_with_nan->nan_value_counts = {{9, 2L}};
+  single_value_with_nan->lower_bounds = {{9, Literal::Float(5.0F).Serialize().value()}};
+  single_value_with_nan->upper_bounds = {{9, Literal::Float(5.0F).Serialize().value()}};
+  RunTest(Expressions::NotIn("no_nans", {Literal::Float(5.0F)}), kRowsMightMatch,
+          single_value_with_nan);
+
+  // bounds are NaN -> unreliable, cannot prune
+  auto single_value_nan_bounds = std::make_shared<DataFile>();
+  single_value_nan_bounds->file_path = "single_value_nan_bounds";
+  single_value_nan_bounds->file_format = FileFormatType::kParquet;
+  single_value_nan_bounds->record_count = 10;
+  single_value_nan_bounds->value_counts = {{9, 10L}};
+  single_value_nan_bounds->null_value_counts = {{9, 0L}};
+  single_value_nan_bounds->nan_value_counts = {{9, 0L}};
+  single_value_nan_bounds->lower_bounds = {
+      {9, Literal::Float(kFloatNan).Serialize().value()}};
+  single_value_nan_bounds->upper_bounds = {
+      {9, Literal::Float(kFloatNan).Serialize().value()}};
+  RunTest(Expressions::NotIn("no_nans", {Literal::Float(5.0F)}), kRowsMightMatch,
+          single_value_nan_bounds);
+}
+
 }  // namespace iceberg