From 1f5438ebf1cfcdb120ee142a06b4113164058915 Mon Sep 17 00:00:00 2001 From: Zhuo Wang Date: Thu, 18 Jun 2026 23:31:11 +0800 Subject: [PATCH 1/2] fix: skip only null partition value rows in ParseDataFile The inner row loop in ParseDataFile() used break to skip null partition values, which aborted processing of all subsequent rows for that partition column. Use continue so only the current null row is skipped. Add a regression test that places a null partition value before non-null ones and verifies the later rows are still parsed. --- src/iceberg/manifest/manifest_reader.cc | 2 +- src/iceberg/test/manifest_reader_test.cc | 41 ++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/src/iceberg/manifest/manifest_reader.cc b/src/iceberg/manifest/manifest_reader.cc index bc7572591..54aceb247 100644 --- a/src/iceberg/manifest/manifest_reader.cc +++ b/src/iceberg/manifest/manifest_reader.cc @@ -477,7 +477,7 @@ Status ParseDataFile(const std::shared_ptr& data_file_schema, auto part_view = field_view->children[part_idx]; for (int64_t row_idx = 0; row_idx < part_view->length; row_idx++) { if (ArrowArrayViewIsNull(part_view, row_idx)) { - break; + continue; } ICEBERG_RETURN_UNEXPECTED( ParsePartitionValues(part_view, row_idx, manifest_entries)); diff --git a/src/iceberg/test/manifest_reader_test.cc b/src/iceberg/test/manifest_reader_test.cc index 4a17eacaa..bd0b6d86a 100644 --- a/src/iceberg/test/manifest_reader_test.cc +++ b/src/iceberg/test/manifest_reader_test.cc @@ -288,6 +288,47 @@ TEST_P(TestManifestReader, TestManifestReaderWithPartitionMetadata) { EXPECT_EQ(read_entry.data_file->partition.values()[0], Literal::Int(0)); } +TEST_P(TestManifestReader, NullPartitionValueDoesNotSkipSubsequentRows) { + auto version = GetParam(); + + // The first entry has a null partition value, followed by entries with + // non-null partition values. A null value must only skip its own row, not + // abort parsing of the remaining rows for that partition column (regression + // test: the inner loop previously used `break` instead of `continue`). + auto file_null = MakeDataFile("/path/to/data-null.parquet", + PartitionValues({Literal::Null(int32())})); + auto file_b = + MakeDataFile("/path/to/data-b.parquet", PartitionValues({Literal::Int(5)})); + auto file_c = + MakeDataFile("/path/to/data-c.parquet", PartitionValues({Literal::Int(7)})); + + std::vector entries; + entries.push_back(MakeEntry(ManifestStatus::kAdded, 1000L, std::move(file_null))); + entries.push_back(MakeEntry(ManifestStatus::kAdded, 1000L, std::move(file_b))); + entries.push_back(MakeEntry(ManifestStatus::kAdded, 1000L, std::move(file_c))); + + auto manifest = WriteManifest(version, /*snapshot_id=*/1000L, std::move(entries)); + + ICEBERG_UNWRAP_OR_FAIL(auto reader, + ManifestReader::Make(manifest, file_io_, schema_, spec_)); + ICEBERG_UNWRAP_OR_FAIL(auto read_entries, reader->Entries()); + + ASSERT_EQ(read_entries.size(), 3U); + + // The null-partition row contributes no partition value. + EXPECT_EQ(read_entries[0].data_file->file_path, "/path/to/data-null.parquet"); + EXPECT_EQ(read_entries[0].data_file->partition.num_fields(), 0); + + // Rows after the null one must still have their partition values parsed. + EXPECT_EQ(read_entries[1].data_file->file_path, "/path/to/data-b.parquet"); + ASSERT_EQ(read_entries[1].data_file->partition.num_fields(), 1); + EXPECT_EQ(read_entries[1].data_file->partition.values()[0], Literal::Int(5)); + + EXPECT_EQ(read_entries[2].data_file->file_path, "/path/to/data-c.parquet"); + ASSERT_EQ(read_entries[2].data_file->partition.num_fields(), 1); + EXPECT_EQ(read_entries[2].data_file->partition.values()[0], Literal::Int(7)); +} + TEST_P(TestManifestReader, ReadsEntriesWhenPartitionSourceFieldIsMissing) { auto version = GetParam(); auto file = MakeDataFile("/path/to/historical-data.parquet", From af6cc6e42d67147d6f1c64359daa5171b8c8ccc8 Mon Sep 17 00:00:00 2001 From: Zhuo Wang Date: Fri, 19 Jun 2026 01:03:23 +0800 Subject: [PATCH 2/2] fix: preserve null partition values as typed nulls in ParseDataFile --- src/iceberg/manifest/manifest_reader.cc | 68 +++++++++--------- src/iceberg/test/manifest_reader_test.cc | 88 ++++++++++++++++-------- 2 files changed, 96 insertions(+), 60 deletions(-) diff --git a/src/iceberg/manifest/manifest_reader.cc b/src/iceberg/manifest/manifest_reader.cc index 54aceb247..8757b5d61 100644 --- a/src/iceberg/manifest/manifest_reader.cc +++ b/src/iceberg/manifest/manifest_reader.cc @@ -389,39 +389,40 @@ Result> ParseManifestList(ArrowSchema* arrow_schema, } Status ParsePartitionValues(ArrowArrayView* view, int64_t row_idx, + const std::shared_ptr& field_type, std::vector& manifest_entries) { + auto& partition = manifest_entries[row_idx].data_file->partition; + if (view->storage_type == ArrowType::NANOARROW_TYPE_NA || + ArrowArrayViewIsNull(view, row_idx)) { + partition.AddValue(Literal::Null(field_type)); + return {}; + } switch (view->storage_type) { - case ArrowType::NANOARROW_TYPE_BOOL: { - auto value = ArrowArrayViewGetUIntUnsafe(view, row_idx); - manifest_entries[row_idx].data_file->partition.AddValue( - Literal::Boolean(value != 0)); - } break; - case ArrowType::NANOARROW_TYPE_INT32: { - auto value = ArrowArrayViewGetIntUnsafe(view, row_idx); - manifest_entries[row_idx].data_file->partition.AddValue(Literal::Int(value)); - } break; - case ArrowType::NANOARROW_TYPE_INT64: { - auto value = ArrowArrayViewGetIntUnsafe(view, row_idx); - manifest_entries[row_idx].data_file->partition.AddValue(Literal::Long(value)); - } break; - case ArrowType::NANOARROW_TYPE_FLOAT: { - auto value = ArrowArrayViewGetDoubleUnsafe(view, row_idx); - manifest_entries[row_idx].data_file->partition.AddValue(Literal::Float(value)); - } break; - case ArrowType::NANOARROW_TYPE_DOUBLE: { - auto value = ArrowArrayViewGetDoubleUnsafe(view, row_idx); - manifest_entries[row_idx].data_file->partition.AddValue(Literal::Double(value)); - } break; + case ArrowType::NANOARROW_TYPE_BOOL: + partition.AddValue( + Literal::Boolean(ArrowArrayViewGetUIntUnsafe(view, row_idx) != 0)); + break; + case ArrowType::NANOARROW_TYPE_INT32: + partition.AddValue(Literal::Int(ArrowArrayViewGetIntUnsafe(view, row_idx))); + break; + case ArrowType::NANOARROW_TYPE_INT64: + partition.AddValue(Literal::Long(ArrowArrayViewGetIntUnsafe(view, row_idx))); + break; + case ArrowType::NANOARROW_TYPE_FLOAT: + partition.AddValue(Literal::Float(ArrowArrayViewGetDoubleUnsafe(view, row_idx))); + break; + case ArrowType::NANOARROW_TYPE_DOUBLE: + partition.AddValue(Literal::Double(ArrowArrayViewGetDoubleUnsafe(view, row_idx))); + break; case ArrowType::NANOARROW_TYPE_STRING: { - auto value = ArrowArrayViewGetStringUnsafe(view, row_idx); - manifest_entries[row_idx].data_file->partition.AddValue( - Literal::String(std::string(value.data, value.size_bytes))); + auto str_value = ArrowArrayViewGetStringUnsafe(view, row_idx); + partition.AddValue( + Literal::String(std::string(str_value.data, str_value.size_bytes))); } break; case ArrowType::NANOARROW_TYPE_BINARY: { - auto buffer = ArrowArrayViewGetBytesUnsafe(view, row_idx); - manifest_entries[row_idx].data_file->partition.AddValue( - Literal::Binary(std::vector(buffer.data.as_char, - buffer.data.as_char + buffer.size_bytes))); + auto buf_value = ArrowArrayViewGetBytesUnsafe(view, row_idx); + partition.AddValue(Literal::Binary(std::vector( + buf_value.data.as_char, buf_value.data.as_char + buf_value.size_bytes))); } break; default: return InvalidManifest("Unsupported type {} for partition values", @@ -473,14 +474,15 @@ Status ParseDataFile(const std::shared_ptr& data_file_schema, case DataFile::kPartitionFieldId: { ICEBERG_RETURN_UNEXPECTED( AssertViewType(field_view, ArrowType::NANOARROW_TYPE_STRUCT, field_name)); + const auto& partition_type = + internal::checked_cast(*field->get().type()); for (int64_t part_idx = 0; part_idx < field_view->n_children; part_idx++) { auto part_view = field_view->children[part_idx]; + auto part_field_type = internal::checked_pointer_cast( + partition_type.fields()[part_idx].type()); for (int64_t row_idx = 0; row_idx < part_view->length; row_idx++) { - if (ArrowArrayViewIsNull(part_view, row_idx)) { - continue; - } - ICEBERG_RETURN_UNEXPECTED( - ParsePartitionValues(part_view, row_idx, manifest_entries)); + ICEBERG_RETURN_UNEXPECTED(ParsePartitionValues( + part_view, row_idx, part_field_type, manifest_entries)); } } } break; diff --git a/src/iceberg/test/manifest_reader_test.cc b/src/iceberg/test/manifest_reader_test.cc index bd0b6d86a..b57b0bc4a 100644 --- a/src/iceberg/test/manifest_reader_test.cc +++ b/src/iceberg/test/manifest_reader_test.cc @@ -288,45 +288,79 @@ TEST_P(TestManifestReader, TestManifestReaderWithPartitionMetadata) { EXPECT_EQ(read_entry.data_file->partition.values()[0], Literal::Int(0)); } -TEST_P(TestManifestReader, NullPartitionValueDoesNotSkipSubsequentRows) { +TEST_P(TestManifestReader, NullPartitionValuePreservedPositionallyAcrossRows) { auto version = GetParam(); - // The first entry has a null partition value, followed by entries with - // non-null partition values. A null value must only skip its own row, not - // abort parsing of the remaining rows for that partition column (regression - // test: the inner loop previously used `break` instead of `continue`). - auto file_null = MakeDataFile("/path/to/data-null.parquet", - PartitionValues({Literal::Null(int32())})); - auto file_b = - MakeDataFile("/path/to/data-b.parquet", PartitionValues({Literal::Int(5)})); - auto file_c = - MakeDataFile("/path/to/data-c.parquet", PartitionValues({Literal::Int(7)})); - - std::vector entries; - entries.push_back(MakeEntry(ManifestStatus::kAdded, 1000L, std::move(file_null))); - entries.push_back(MakeEntry(ManifestStatus::kAdded, 1000L, std::move(file_b))); - entries.push_back(MakeEntry(ManifestStatus::kAdded, 1000L, std::move(file_c))); - - auto manifest = WriteManifest(version, /*snapshot_id=*/1000L, std::move(entries)); - - ICEBERG_UNWRAP_OR_FAIL(auto reader, - ManifestReader::Make(manifest, file_io_, schema_, spec_)); + // Two partition fields, with several entries. The first entry has a null in + // partition field 0. The null must be preserved as a typed null in place so + // that (a) the tuple keeps one value per partition field (arity matches the + // spec), (b) the non-null value for field 1 stays in position 1 instead of + // shifting into position 0, and (c) parsing of the remaining rows is not + // aborted. This mirrors Iceberg Java's PartitionData, a fixed-size positional + // array that stores null in place. + auto multi_schema = std::make_shared(std::vector{ + SchemaField::MakeRequired(/*field_id=*/3, "id", int32()), + SchemaField::MakeRequired(/*field_id=*/4, "data", string())}); + + std::shared_ptr multi_spec; + ICEBERG_UNWRAP_OR_FAIL( + multi_spec, + PartitionSpec::Make( + /*spec_id=*/0, {PartitionField(/*source_id=*/3, /*field_id=*/1000, "id_part", + Transform::Identity()), + PartitionField(/*source_id=*/4, /*field_id=*/1001, + "data_bucket", Transform::Bucket(16))})); + + const std::string manifest_path = MakeManifestPath(); + auto writer_result = ManifestWriter::MakeWriter( + version, /*snapshot_id=*/1000L, manifest_path, file_io_, multi_spec, multi_schema, + ManifestContent::kData, /*first_row_id=*/0L); + ASSERT_THAT(writer_result, IsOk()); + auto writer = std::move(writer_result.value()); + + // First entry: partition field 0 (identity int) is null, field 1 (bucket) is 9. + auto file_null = + MakeDataFile("/path/to/data-null.parquet", + PartitionValues({Literal::Null(int32()), Literal::Int(9)})); + auto file_b = MakeDataFile("/path/to/data-b.parquet", + PartitionValues({Literal::Int(1), Literal::Int(5)})); + auto file_c = MakeDataFile("/path/to/data-c.parquet", + PartitionValues({Literal::Int(2), Literal::Int(7)})); + ASSERT_THAT( + writer->WriteEntry(MakeEntry(ManifestStatus::kAdded, 1000L, std::move(file_null))), + IsOk()); + ASSERT_THAT( + writer->WriteEntry(MakeEntry(ManifestStatus::kAdded, 1000L, std::move(file_b))), + IsOk()); + ASSERT_THAT( + writer->WriteEntry(MakeEntry(ManifestStatus::kAdded, 1000L, std::move(file_c))), + IsOk()); + ASSERT_THAT(writer->Close(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto manifest, writer->ToManifestFile()); + + ICEBERG_UNWRAP_OR_FAIL( + auto reader, ManifestReader::Make(manifest, file_io_, multi_schema, multi_spec)); ICEBERG_UNWRAP_OR_FAIL(auto read_entries, reader->Entries()); ASSERT_EQ(read_entries.size(), 3U); - // The null-partition row contributes no partition value. + // The null-partition row keeps both fields: a null in position 0 and the + // non-null value in position 1 (no shifting). EXPECT_EQ(read_entries[0].data_file->file_path, "/path/to/data-null.parquet"); - EXPECT_EQ(read_entries[0].data_file->partition.num_fields(), 0); + ASSERT_EQ(read_entries[0].data_file->partition.num_fields(), 2); + EXPECT_TRUE(read_entries[0].data_file->partition.values()[0].IsNull()); + EXPECT_EQ(read_entries[0].data_file->partition.values()[1], Literal::Int(9)); // Rows after the null one must still have their partition values parsed. EXPECT_EQ(read_entries[1].data_file->file_path, "/path/to/data-b.parquet"); - ASSERT_EQ(read_entries[1].data_file->partition.num_fields(), 1); - EXPECT_EQ(read_entries[1].data_file->partition.values()[0], Literal::Int(5)); + ASSERT_EQ(read_entries[1].data_file->partition.num_fields(), 2); + EXPECT_EQ(read_entries[1].data_file->partition.values()[0], Literal::Int(1)); + EXPECT_EQ(read_entries[1].data_file->partition.values()[1], Literal::Int(5)); EXPECT_EQ(read_entries[2].data_file->file_path, "/path/to/data-c.parquet"); - ASSERT_EQ(read_entries[2].data_file->partition.num_fields(), 1); - EXPECT_EQ(read_entries[2].data_file->partition.values()[0], Literal::Int(7)); + ASSERT_EQ(read_entries[2].data_file->partition.num_fields(), 2); + EXPECT_EQ(read_entries[2].data_file->partition.values()[0], Literal::Int(2)); + EXPECT_EQ(read_entries[2].data_file->partition.values()[1], Literal::Int(7)); } TEST_P(TestManifestReader, ReadsEntriesWhenPartitionSourceFieldIsMissing) {