Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 42 additions & 1 deletion src/iceberg/expression/inclusive_metrics_evaluator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,13 @@ class InclusiveMetricsVisitor : public BoundVisitor<bool> {
Result<bool> NotEq(const std::shared_ptr<Bound>& expr, const Literal& lit) override {
  // Bounds are not guaranteed to be actual column values: for notEq(col, X) with
  // bounds (X, Y), X may never occur in the file, so bounds alone cannot rule the
  // file out.
  //
  // The one exception is a file for which UniqueValue() reports a single value
  // shared by every row; if that value is the literal, no row can satisfy `!=`.
  ICEBERG_ASSIGN_OR_RAISE(auto only_value, UniqueValue(expr));
  if (!only_value.has_value()) {
    return kRowsMightMatch;
  }

  if (*only_value == lit) {
    return kRowCannotMatch;
  }

  return kRowsMightMatch;
}

Expand Down Expand Up @@ -271,7 +278,13 @@ class InclusiveMetricsVisitor : public BoundVisitor<bool> {
const BoundSetPredicate::LiteralSet& literal_set) override {
// because the bounds are not necessarily a min or max value, this cannot be answered
// using them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a value in
// col. However, when min == max and the file has no nulls or NaN values, we can
// safely prune if that value is in the exclusion set.
ICEBERG_ASSIGN_OR_RAISE(auto value, UniqueValue(expr));
if (value.has_value() && literal_set.contains(value.value())) {
return kRowCannotMatch;
}

return kRowsMightMatch;
}

Expand Down Expand Up @@ -416,6 +429,34 @@ class InclusiveMetricsVisitor : public BoundVisitor<bool> {
// TODO(xiao.dong) handle extract lower and upper bounds
}

/// Returns the single value held by every row of the referenced column, or
/// std::nullopt when the file metrics cannot establish that such a value exists.
///
/// A value is reported only when all of the following hold:
///  - MayContainNull() reports that the field cannot contain nulls,
///  - both the lower and the upper bound are present and neither is null or NaN,
///  - the recorded NaN count for the field, if any, is zero,
///  - the lower bound compares equal to the upper bound.
///
/// Errors from LowerBound() / UpperBound() are propagated to the caller.
Result<std::optional<Literal>> UniqueValue(const std::shared_ptr<Bound>& expr) {
  const int32_t field_id = expr->reference()->field().field_id();
  if (MayContainNull(field_id)) {
    return std::nullopt;
  }

  ICEBERG_ASSIGN_OR_RAISE(auto lo, LowerBound(expr));
  ICEBERG_ASSIGN_OR_RAISE(auto hi, UpperBound(expr));

  // Both bounds must exist and be ordinary (non-null, non-NaN) values to be usable.
  const bool bounds_usable = lo.has_value() && hi.has_value() && !lo->IsNull() &&
                             !hi->IsNull() && !lo->IsNaN() && !hi->IsNaN();
  if (!bounds_usable) {
    return std::nullopt;
  }

  // A missing NaN count entry is treated the same as a count of zero.
  const auto& nan_counts = data_file_.nan_value_counts;
  const auto nan_entry = nan_counts.find(field_id);
  if (nan_entry != nan_counts.cend() && nan_entry->second != 0) {
    return std::nullopt;
  }

  if (*lo != *hi) {
    return std::nullopt;
  }

  return lo;
}

Result<std::optional<Literal>> ParseLowerBound(const BoundReference& ref) {
int32_t id = ref.field().field_id();
auto type = ref.type();
Expand Down
154 changes: 154 additions & 0 deletions src/iceberg/test/inclusive_metrics_evaluator_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -945,4 +945,158 @@ TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerNotInTest) {
RunTest(Expressions::NotIn("id", ids), kRowsMightMatch, file1_);
}

TEST_F(InclusiveMetricsEvaluatorMigratedTest, NotEqWithSingleValue) {
  // Builds a 10-row Parquet data file that carries metrics for exactly one field.
  auto make_file = [](const std::string& path, int32_t field_id, int64_t null_count,
                      int64_t nan_count, Literal lower, Literal upper) {
    auto file = std::make_shared<DataFile>();
    file->file_path = path;
    file->file_format = FileFormatType::kParquet;
    file->record_count = 10;
    file->value_counts = {{field_id, 10L}};
    file->null_value_counts = {{field_id, null_count}};
    file->nan_value_counts = {{field_id, nan_count}};
    file->lower_bounds = {{field_id, lower.Serialize().value()}};
    file->upper_bounds = {{field_id, upper.Serialize().value()}};
    return file;
  };

  // Bounds span a range of values: a single literal cannot exclude the file.
  auto range_of_values = make_file("range_of_values", 3, 0L, 0L, Literal::String("aaa"),
                                   Literal::String("zzz"));
  RunTest(Expressions::NotEqual("required", Literal::String("aaa")), kRowsMightMatch,
          range_of_values);

  // lower == upper with no nulls and no NaNs: every row holds "abc".
  auto single_value = make_file("single_value", 3, 0L, 0L, Literal::String("abc"),
                                Literal::String("abc"));
  // The literal is the single value -> no row can satisfy !=.
  RunTest(Expressions::NotEqual("required", Literal::String("abc")), kRowCannotMatch,
          single_value);
  // The literal is some other value -> rows might match.
  RunTest(Expressions::NotEqual("required", Literal::String("def")), kRowsMightMatch,
          single_value);

  // lower == upper, but null rows are present and they satisfy !=.
  auto single_value_with_nulls = make_file(
      "single_value_nulls", 3, 2L, 0L, Literal::String("abc"), Literal::String("abc"));
  RunTest(Expressions::NotEqual("required", Literal::String("abc")), kRowsMightMatch,
          single_value_with_nulls);

  // lower == upper, but NaN rows are present and they satisfy !=.
  auto single_value_with_nan = make_file("single_value_nan", 9, 0L, 2L,
                                         Literal::Float(5.0F), Literal::Float(5.0F));
  RunTest(Expressions::NotEqual("no_nans", Literal::Float(5.0F)), kRowsMightMatch,
          single_value_with_nan);

  // NaN bounds are unreliable, so the file must not be pruned.
  auto single_value_nan_bounds =
      make_file("single_value_nan_bounds", 9, 0L, 0L, Literal::Float(kFloatNan),
                Literal::Float(kFloatNan));
  RunTest(Expressions::NotEqual("no_nans", Literal::Float(5.0F)), kRowsMightMatch,
          single_value_nan_bounds);
}

TEST_F(InclusiveMetricsEvaluatorMigratedTest, NotInWithSingleValue) {
  // Builds a 10-row Parquet data file that carries metrics for exactly one field.
  auto make_file = [](const std::string& path, int32_t field_id, int64_t null_count,
                      int64_t nan_count, Literal lower, Literal upper) {
    auto file = std::make_shared<DataFile>();
    file->file_path = path;
    file->file_format = FileFormatType::kParquet;
    file->record_count = 10;
    file->value_counts = {{field_id, 10L}};
    file->null_value_counts = {{field_id, null_count}};
    file->nan_value_counts = {{field_id, nan_count}};
    file->lower_bounds = {{field_id, lower.Serialize().value()}};
    file->upper_bounds = {{field_id, upper.Serialize().value()}};
    return file;
  };

  // Bounds span a range of values: the exclusion set cannot exclude the file.
  auto range_of_values = make_file("range_of_values", 3, 0L, 0L, Literal::String("aaa"),
                                   Literal::String("zzz"));
  RunTest(
      Expressions::NotIn("required", {Literal::String("aaa"), Literal::String("bbb")}),
      kRowsMightMatch, range_of_values);

  // lower == upper with no nulls and no NaNs: every row holds "abc".
  auto single_value = make_file("single_value", 3, 0L, 0L, Literal::String("abc"),
                                Literal::String("abc"));
  // The single value is part of the exclusion set -> no row can match.
  RunTest(
      Expressions::NotIn("required", {Literal::String("abc"), Literal::String("def")}),
      kRowCannotMatch, single_value);
  // The single value is outside the exclusion set -> rows might match.
  RunTest(
      Expressions::NotIn("required", {Literal::String("def"), Literal::String("ghi")}),
      kRowsMightMatch, single_value);

  // lower == upper, but null rows are present and they satisfy notIn.
  auto single_value_with_nulls = make_file(
      "single_value_nulls", 3, 2L, 0L, Literal::String("abc"), Literal::String("abc"));
  RunTest(
      Expressions::NotIn("required", {Literal::String("abc"), Literal::String("def")}),
      kRowsMightMatch, single_value_with_nulls);

  // lower == upper, but NaN rows are present and they satisfy notIn.
  auto single_value_with_nan = make_file("single_value_nan", 9, 0L, 2L,
                                         Literal::Float(5.0F), Literal::Float(5.0F));
  RunTest(Expressions::NotIn("no_nans", {Literal::Float(5.0F)}), kRowsMightMatch,
          single_value_with_nan);

  // NaN bounds are unreliable, so the file must not be pruned.
  auto single_value_nan_bounds =
      make_file("single_value_nan_bounds", 9, 0L, 0L, Literal::Float(kFloatNan),
                Literal::Float(kFloatNan));
  RunTest(Expressions::NotIn("no_nans", {Literal::Float(5.0F)}), kRowsMightMatch,
          single_value_nan_bounds);
}

} // namespace iceberg
Loading