From 81c306329d8a59aa6a76a91a210ea98bcb4ff192 Mon Sep 17 00:00:00 2001 From: "Chen, Junjie" Date: Sat, 8 Sep 2018 21:59:17 +0800 Subject: [PATCH 1/9] PARQUET-1328: Add Bloom filter reader and writer --- .../apache/parquet/cli/util/Expressions.java | 4 +- parquet-column/pom.xml | 1 - .../parquet/column/ParquetProperties.java | 63 ++- .../column/impl/ColumnWriteStoreV1.java | 20 +- .../column/impl/ColumnWriteStoreV2.java | 26 ++ .../parquet/column/impl/ColumnWriterV1.java | 51 ++- .../parquet/column/impl/ColumnWriterV2.java | 56 ++- .../values/bloomfilter/BloomFilter.java | 373 ++++++++++++++++++ .../bloomfilter/BloomFilterReadStore.java | 37 ++ .../values/bloomfilter/BloomFilterReader.java | 31 ++ .../bloomfilter/BloomFilterWriteStore.java | 34 ++ .../values/bloomfilter/BloomFilterWriter.java | 29 ++ .../converter/ParquetMetadataConverter.java | 2 + .../parquet/hadoop/BloomFilterDataReader.java | 80 ++++ .../hadoop/ColumnChunkPageWriteStore.java | 24 +- .../parquet/hadoop/ParquetFileReader.java | 41 +- .../parquet/hadoop/ParquetFileWriter.java | 14 + .../parquet/hadoop/ParquetInputFormat.java | 6 + .../parquet/hadoop/ParquetOutputFormat.java | 29 ++ .../hadoop/metadata/ColumnChunkMetaData.java | 70 ++++ .../parquet/hadoop/TestParquetFileWriter.java | 39 ++ pom.xml | 2 +- 22 files changed, 1006 insertions(+), 26 deletions(-) create mode 100644 parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterReadStore.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterReader.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriteStore.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriter.java create mode 100644 
parquet-hadoop/src/main/java/org/apache/parquet/hadoop/BloomFilterDataReader.java diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/util/Expressions.java b/parquet-cli/src/main/java/org/apache/parquet/cli/util/Expressions.java index 06b28b46ae..d18ef559f2 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/util/Expressions.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/util/Expressions.java @@ -19,7 +19,7 @@ package org.apache.parquet.cli.util; -import com.google.common.base.Objects; +import com.google.common.base.MoreObjects; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import org.apache.avro.Schema; @@ -385,7 +385,7 @@ public int hashCode() { @Override public String toString() { - return Objects.toStringHelper(this) + return MoreObjects.toStringHelper(this) .add("type", type) .add("value", value) .add("children", children) diff --git a/parquet-column/pom.xml b/parquet-column/pom.xml index f85c8b908b..2c40029137 100644 --- a/parquet-column/pom.xml +++ b/parquet-column/pom.xml @@ -93,7 +93,6 @@ com.google.guava guava ${guava.version} - test diff --git a/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java b/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java index 39b65da9fa..94f1978f68 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java @@ -18,21 +18,23 @@ */ package org.apache.parquet.column; +import static org.apache.parquet.bytes.BytesUtils.getWidthFromMaxInt; + +import java.util.HashMap; + import org.apache.parquet.Preconditions; import org.apache.parquet.bytes.ByteBufferAllocator; import org.apache.parquet.bytes.CapacityByteArrayOutputStream; import org.apache.parquet.bytes.HeapByteBufferAllocator; - -import static org.apache.parquet.bytes.BytesUtils.getWidthFromMaxInt; import 
org.apache.parquet.column.impl.ColumnWriteStoreV1; import org.apache.parquet.column.impl.ColumnWriteStoreV2; import org.apache.parquet.column.page.PageWriteStore; import org.apache.parquet.column.values.ValuesWriter; import org.apache.parquet.column.values.bitpacking.DevNullValuesWriter; import org.apache.parquet.column.values.factory.DefaultValuesWriterFactory; +import org.apache.parquet.column.values.factory.ValuesWriterFactory; import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder; import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter; -import org.apache.parquet.column.values.factory.ValuesWriterFactory; import org.apache.parquet.schema.MessageType; /** @@ -47,6 +49,7 @@ public class ParquetProperties { public static final boolean DEFAULT_ESTIMATE_ROW_COUNT_FOR_PAGE_SIZE_CHECK = true; public static final int DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK = 100; public static final int DEFAULT_MAXIMUM_RECORD_COUNT_FOR_CHECK = 10000; + public static final boolean DEFAULT_BLOOM_FILTER_ENABLED = false; public static final ValuesWriterFactory DEFAULT_VALUES_WRITER_FACTORY = new DefaultValuesWriterFactory(); @@ -83,10 +86,12 @@ public static WriterVersion fromString(String name) { private final boolean estimateNextSizeCheck; private final ByteBufferAllocator allocator; private final ValuesWriterFactory valuesWriterFactory; + private final boolean enableBloomFilter; + private final HashMap bloomFilterInfo; private ParquetProperties(WriterVersion writerVersion, int pageSize, int dictPageSize, boolean enableDict, int minRowCountForPageSizeCheck, int maxRowCountForPageSizeCheck, boolean estimateNextSizeCheck, ByteBufferAllocator allocator, - ValuesWriterFactory writerFactory) { + ValuesWriterFactory writerFactory, boolean enableBloomFilter, HashMap bloomFilterInfo) { this.pageSizeThreshold = pageSize; this.initialSlabSize = CapacityByteArrayOutputStream .initialSlabSizeHeuristic(MIN_SLAB_SIZE, pageSizeThreshold, 10); @@ -97,7 
+102,8 @@ private ParquetProperties(WriterVersion writerVersion, int pageSize, int dictPag this.maxRowCountForPageSizeCheck = maxRowCountForPageSizeCheck; this.estimateNextSizeCheck = estimateNextSizeCheck; this.allocator = allocator; - + this.enableBloomFilter = enableBloomFilter; + this.bloomFilterInfo = bloomFilterInfo; this.valuesWriterFactory = writerFactory; } @@ -159,6 +165,14 @@ public ByteBufferAllocator getAllocator() { return allocator; } + public boolean isBloomFilterEnabled() { + return enableBloomFilter; + } + + public HashMap getBloomFilterInfo() { + return bloomFilterInfo; + } + public ColumnWriteStore newColumnWriteStore(MessageType schema, PageWriteStore pageStore) { switch (writerVersion) { @@ -199,6 +213,8 @@ public static class Builder { private int pageSize = DEFAULT_PAGE_SIZE; private int dictPageSize = DEFAULT_DICTIONARY_PAGE_SIZE; private boolean enableDict = DEFAULT_IS_DICTIONARY_ENABLED; + private boolean enableBloomFilter = DEFAULT_BLOOM_FILTER_ENABLED; + private HashMap bloomFilterInfo = new HashMap<>(); private WriterVersion writerVersion = DEFAULT_WRITER_VERSION; private int minRowCountForPageSizeCheck = DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK; private int maxRowCountForPageSizeCheck = DEFAULT_MAXIMUM_RECORD_COUNT_FOR_CHECK; @@ -217,6 +233,8 @@ private Builder(ParquetProperties toCopy) { this.maxRowCountForPageSizeCheck = toCopy.maxRowCountForPageSizeCheck; this.estimateNextSizeCheck = toCopy.estimateNextSizeCheck; this.allocator = toCopy.allocator; + this.enableBloomFilter = toCopy.enableBloomFilter; + this.bloomFilterInfo = new HashMap<>(toCopy.bloomFilterInfo); } /** @@ -256,6 +274,38 @@ public Builder withDictionaryPageSize(int dictionaryPageSize) { return this; } + /** + * Set to enable Bloom filter. + * + * @param enableBloomFilter a boolean to indicate whether to enable Bloom filter. + * @return this builder for method chaining. 
+ */ + public Builder withBloomFilterEnabled(boolean enableBloomFilter) { + this.enableBloomFilter = enableBloomFilter; + return this; + } + + /** + * Set Bloom filter info for columns. + * + * @param names comma-separated names of the columns to enable Bloom filters for + * @param sizes comma-separated sizes, one per column and in the same order as names + * @return this builder for method chaining + */ + public Builder withBloomFilterInfo(String names, String sizes) { + String[] bloomFilterColumns = names.split(","); + String[] bloomFilterSizes = sizes.split(","); + + Preconditions.checkArgument(bloomFilterColumns.length == bloomFilterSizes.length, + "Column names are not matched to sizes"); + + for (int i = 0; i < bloomFilterColumns.length; i++) { + bloomFilterInfo.put(bloomFilterColumns[i].trim(), Long.valueOf(bloomFilterSizes[i].trim())); + } + + return this; + } + /** * Set the {@link WriterVersion format version}. * @@ -303,7 +353,8 @@ public ParquetProperties build() { ParquetProperties properties = new ParquetProperties(writerVersion, pageSize, dictPageSize, enableDict, minRowCountForPageSizeCheck, maxRowCountForPageSizeCheck, - estimateNextSizeCheck, allocator, valuesWriterFactory); + estimateNextSizeCheck, allocator, valuesWriterFactory, + enableBloomFilter, bloomFilterInfo); // we pass a constructed but uninitialized factory to ParquetProperties above as currently // creation of ValuesWriters is invoked from within ParquetProperties. 
In the future // we'd like to decouple that and won't need to pass an object to properties and then pass the diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV1.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV1.java index 93a497fad8..bd401430ad 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV1.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV1.java @@ -25,20 +25,21 @@ import java.util.Set; import java.util.TreeMap; -import org.apache.parquet.bytes.ByteBufferAllocator; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.ColumnWriteStore; import org.apache.parquet.column.ColumnWriter; import org.apache.parquet.column.ParquetProperties; -import org.apache.parquet.column.ParquetProperties.WriterVersion; import org.apache.parquet.column.page.PageWriteStore; import org.apache.parquet.column.page.PageWriter; +import org.apache.parquet.column.values.bloomfilter.BloomFilterWriteStore; +import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter; public class ColumnWriteStoreV1 implements ColumnWriteStore { private final Map columns = new TreeMap(); private final PageWriteStore pageWriteStore; private final ParquetProperties props; + private BloomFilterWriteStore bloomFilterWriteStore; public ColumnWriteStoreV1(PageWriteStore pageWriteStore, ParquetProperties props) { @@ -46,6 +47,13 @@ public ColumnWriteStoreV1(PageWriteStore pageWriteStore, this.props = props; } + public ColumnWriteStoreV1(PageWriteStore pageWriteStore, + BloomFilterWriteStore bloomFilterWriteStore, + ParquetProperties props) { + this (pageWriteStore, props); + this.bloomFilterWriteStore = bloomFilterWriteStore; + } + public ColumnWriter getColumnWriter(ColumnDescriptor path) { ColumnWriterV1 column = columns.get(path); if (column == null) { @@ -61,7 +69,13 @@ public Set getColumnDescriptors() { private 
ColumnWriterV1 newMemColumn(ColumnDescriptor path) { PageWriter pageWriter = pageWriteStore.getPageWriter(path); - return new ColumnWriterV1(path, pageWriter, props); + + if (bloomFilterWriteStore != null && props.isBloomFilterEnabled() && props.getBloomFilterInfo() != null) { + BloomFilterWriter bloomFilterWriter = bloomFilterWriteStore.getBloomFilterWriter(path); + return new ColumnWriterV1(path, pageWriter, bloomFilterWriter, props); + } else { + return new ColumnWriterV1(path, pageWriter, props); + } } @Override diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV2.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV2.java index 7574cedf75..057660ff5d 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV2.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV2.java @@ -35,6 +35,8 @@ import org.apache.parquet.column.ParquetProperties; import org.apache.parquet.column.page.PageWriteStore; import org.apache.parquet.column.page.PageWriter; +import org.apache.parquet.column.values.bloomfilter.BloomFilterWriteStore; +import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter; import org.apache.parquet.schema.MessageType; public class ColumnWriteStoreV2 implements ColumnWriteStore { @@ -66,6 +68,30 @@ public ColumnWriteStoreV2( this.rowCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck(); } + public ColumnWriteStoreV2( + MessageType schema, + PageWriteStore pageWriteStore, + BloomFilterWriteStore bloomFilterWriteStore, + ParquetProperties props) { + this.props = props; + this.thresholdTolerance = (long)(props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO); + Map mcolumns = new TreeMap(); + + for (ColumnDescriptor path : schema.getColumns()) { + PageWriter pageWriter = pageWriteStore.getPageWriter(path); + if (bloomFilterWriteStore != null && props.isBloomFilterEnabled() && props.getBloomFilterInfo() != null) { + BloomFilterWriter bloomFilterWriter = 
bloomFilterWriteStore.getBloomFilterWriter(path); + mcolumns.put(path, new ColumnWriterV2(path, pageWriter, bloomFilterWriter, props)); + } else { + mcolumns.put(path, new ColumnWriterV2(path, pageWriter, props)); + } + } + this.columns = unmodifiableMap(mcolumns); + this.writers = this.columns.values(); + + this.rowCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck(); + } + public ColumnWriter getColumnWriter(ColumnDescriptor path) { return columns.get(path); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java index c1f5d67b01..daf51cfa0d 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java @@ -21,6 +21,7 @@ import static org.apache.parquet.bytes.BytesInput.concat; import java.io.IOException; +import java.util.HashMap; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.ColumnWriter; @@ -29,6 +30,8 @@ import org.apache.parquet.column.page.PageWriter; import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.bloomfilter.BloomFilter; +import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter; import org.apache.parquet.io.ParquetEncodingException; import org.apache.parquet.io.api.Binary; import org.slf4j.Logger; @@ -55,6 +58,23 @@ final class ColumnWriterV1 implements ColumnWriter { private int valueCountForNextSizeCheck; private Statistics statistics; + private BloomFilterWriter bloomFilterWriter; + private BloomFilter bloomFilter; + + public ColumnWriterV1(ColumnDescriptor path, PageWriter pageWriter, + BloomFilterWriter bloomFilterWriter, ParquetProperties props) { + this(path, pageWriter, props); + + // Current not support nested column. 
+ if (path.getPath().length == 1) { + this.bloomFilterWriter = bloomFilterWriter; + HashMap bloomFilterInfo = props.getBloomFilterInfo(); + String column = path.getPath()[0]; + if (bloomFilterInfo.keySet().contains(column)) { + this.bloomFilter = new BloomFilter(bloomFilterInfo.get(column).intValue()); + } + } + } public ColumnWriterV1(ColumnDescriptor path, PageWriter pageWriter, ParquetProperties props) { @@ -177,6 +197,9 @@ public void write(double value, int repetitionLevel, int definitionLevel) { definitionLevelColumn.writeInteger(definitionLevel); dataColumn.writeDouble(value); updateStatistics(value); + if (bloomFilter != null) { + bloomFilter.insert(bloomFilter.hash(value)); + } accountForValueWritten(); } @@ -187,6 +210,9 @@ public void write(float value, int repetitionLevel, int definitionLevel) { definitionLevelColumn.writeInteger(definitionLevel); dataColumn.writeFloat(value); updateStatistics(value); + if (bloomFilter != null) { + bloomFilter.insert(bloomFilter.hash(value)); + } accountForValueWritten(); } @@ -197,6 +223,9 @@ public void write(Binary value, int repetitionLevel, int definitionLevel) { definitionLevelColumn.writeInteger(definitionLevel); dataColumn.writeBytes(value); updateStatistics(value); + if (bloomFilter != null) { + bloomFilter.insert(bloomFilter.hash(value)); + } accountForValueWritten(); } @@ -217,6 +246,9 @@ public void write(int value, int repetitionLevel, int definitionLevel) { definitionLevelColumn.writeInteger(definitionLevel); dataColumn.writeInteger(value); updateStatistics(value); + if (bloomFilter != null) { + bloomFilter.insert(bloomFilter.hash(value)); + } accountForValueWritten(); } @@ -227,6 +259,9 @@ public void write(long value, int repetitionLevel, int definitionLevel) { definitionLevelColumn.writeInteger(definitionLevel); dataColumn.writeLong(value); updateStatistics(value); + if (bloomFilter != null) { + bloomFilter.insert(bloomFilter.hash(value)); + } accountForValueWritten(); } @@ -244,6 +279,10 @@ public void 
flush() { } dataColumn.resetDictionary(); } + + if (bloomFilterWriter != null && bloomFilter != null) { + bloomFilterWriter.writeBloomFilter(bloomFilter); + } } @Override @@ -257,17 +296,21 @@ public void close() { @Override public long getBufferedSizeInMemory() { + long bloomBufferSize = bloomFilter == null ? 0 : bloomFilter.getBufferedSize(); return repetitionLevelColumn.getBufferedSize() + definitionLevelColumn.getBufferedSize() + dataColumn.getBufferedSize() - + pageWriter.getMemSize(); + + pageWriter.getMemSize() + + bloomBufferSize; } public long allocatedSize() { + long bloomAllocatedSize = bloomFilter == null ? 0 : bloomFilter.getBufferedSize(); return repetitionLevelColumn.getAllocatedSize() - + definitionLevelColumn.getAllocatedSize() - + dataColumn.getAllocatedSize() - + pageWriter.allocatedSize(); + + definitionLevelColumn.getAllocatedSize() + + dataColumn.getAllocatedSize() + + pageWriter.allocatedSize() + + bloomAllocatedSize; } public String memUsageString(String indent) { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java index 9abdee8a52..e041e97b57 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java @@ -19,6 +19,7 @@ package org.apache.parquet.column.impl; import java.io.IOException; +import java.util.HashMap; import org.apache.parquet.Ints; import org.apache.parquet.bytes.BytesInput; @@ -30,6 +31,8 @@ import org.apache.parquet.column.page.PageWriter; import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.bloomfilter.BloomFilter; +import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter; import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder; import 
org.apache.parquet.io.ParquetEncodingException; import org.apache.parquet.io.api.Binary; @@ -53,6 +56,9 @@ final class ColumnWriterV2 implements ColumnWriter { private ValuesWriter dataColumn; private int valueCount; + private BloomFilterWriter bloomFilterWriter; + private BloomFilter bloomFilter; + private Statistics statistics; private long rowsWrittenSoFar = 0; @@ -69,6 +75,25 @@ public ColumnWriterV2( this.dataColumn = props.newValuesWriter(path); } + public ColumnWriterV2( + ColumnDescriptor path, + PageWriter pageWriter, + BloomFilterWriter bloomFilterWriter, + ParquetProperties props) { + this(path, pageWriter, props); + + this.bloomFilterWriter = bloomFilterWriter; + HashMap bloomFilterInfo = props.getBloomFilterInfo(); + + // Current not support nested column. + if (path.getPath().length == 1) { + String column = path.getPath()[0]; + if (bloomFilterInfo.keySet().contains(column)) { + this.bloomFilter = new BloomFilter(bloomFilterInfo.get(column).intValue()); + } + } + } + private void log(Object value, int r, int d) { LOG.debug("{} {} r:{} d:{}", path, value, r, d); } @@ -134,6 +159,9 @@ public void write(double value, int repetitionLevel, int definitionLevel) { definitionLevel(definitionLevel); dataColumn.writeDouble(value); statistics.updateStats(value); + if (bloomFilter != null) { + bloomFilter.insert(bloomFilter.hash(value)); + } ++ valueCount; } @@ -149,6 +177,9 @@ public void write(float value, int repetitionLevel, int definitionLevel) { definitionLevel(definitionLevel); dataColumn.writeFloat(value); statistics.updateStats(value); + if (bloomFilter != null) { + bloomFilter.insert(bloomFilter.hash(value)); + } ++ valueCount; } @@ -164,6 +195,9 @@ public void write(Binary value, int repetitionLevel, int definitionLevel) { definitionLevel(definitionLevel); dataColumn.writeBytes(value); statistics.updateStats(value); + if (bloomFilter != null) { + bloomFilter.insert(bloomFilter.hash(value)); + } ++ valueCount; } @@ -194,6 +228,9 @@ public void write(int 
value, int repetitionLevel, int definitionLevel) { definitionLevel(definitionLevel); dataColumn.writeInteger(value); statistics.updateStats(value); + if (bloomFilter != null) { + bloomFilter.insert(bloomFilter.hash(value)); + } ++ valueCount; } @@ -209,6 +246,9 @@ public void write(long value, int repetitionLevel, int definitionLevel) { definitionLevel(definitionLevel); dataColumn.writeLong(value); statistics.updateStats(value); + if (bloomFilter != null) { + bloomFilter.insert(bloomFilter.hash(value)); + } ++ valueCount; } @@ -227,6 +267,10 @@ public void finalizeColumnChunk() { } dataColumn.resetDictionary(); } + + if (bloomFilterWriter != null && bloomFilter != null) { + bloomFilterWriter.writeBloomFilter(bloomFilter); + } } /** @@ -234,9 +278,11 @@ public void finalizeColumnChunk() { * @return the number of bytes of memory used to buffer the current data */ public long getCurrentPageBufferedSize() { + long bloomBufferSize = bloomFilter == null ? 0 : bloomFilter.getBufferedSize(); return repetitionLevelColumn.getBufferedSize() + definitionLevelColumn.getBufferedSize() - + dataColumn.getBufferedSize(); + + dataColumn.getBufferedSize() + + bloomBufferSize; } /** @@ -244,20 +290,24 @@ public long getCurrentPageBufferedSize() { * @return the number of bytes of memory used to buffer the current data and the previously written pages */ public long getTotalBufferedSize() { + long bloomBufferSize = bloomFilter == null ? 0 : bloomFilter.getBufferedSize(); return repetitionLevelColumn.getBufferedSize() + definitionLevelColumn.getBufferedSize() + dataColumn.getBufferedSize() - + pageWriter.getMemSize(); + + pageWriter.getMemSize() + + bloomBufferSize; } /** * @return actual memory used */ public long allocatedSize() { + long bloomFilterSize = bloomFilter == null ? 
0 : bloomFilter.getBufferedSize(); return repetitionLevelColumn.getAllocatedSize() + definitionLevelColumn.getAllocatedSize() + dataColumn.getAllocatedSize() - + pageWriter.allocatedSize(); + + pageWriter.allocatedSize() + + bloomFilterSize; } /** diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java new file mode 100644 index 0000000000..4548617b1b --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java @@ -0,0 +1,373 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.bloomfilter; + +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; +import org.apache.parquet.Preconditions; +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.io.api.Binary; + +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.IntBuffer; + +/** + * A Bloom filter is a compact structure to indicate whether an item is not in a set or probably + * in a set. 
BloomFilter class stores a bit set that represents a set of elements, a hash strategy and a + * Bloom filter algorithm. + * + * This Bloom filter is implemented using the block-based Bloom filter algorithm from Putze et al.'s + * "Cache-, Hash- and Space-Efficient Bloom filters". The basic idea is to hash the item to a tiny + * Bloom filter whose size fits in a single cache line or smaller. This implementation sets 8 bits in + * each tiny Bloom filter. Each tiny Bloom filter is 32 bytes to take advantage of 32-byte SIMD + * instructions. + */ + +public class BloomFilter { + // Bloom filter hash strategy. + public enum HashStrategy { + MURMUR3_X64_128, + } + + // Bloom filter algorithm. + public enum Algorithm { + BLOCK, + } + + // Bytes in a tiny Bloom filter block. + private static final int BYTES_PER_FILTER_BLOCK = 32; + + // Default seed for the hash function; the value was taken from System.nanoTime(). + private static final int DEFAULT_SEED = 1361930890; + + // Minimum Bloom filter size, set to the size of a tiny Bloom filter block. + public static final int MINIMUM_BLOOM_FILTER_BYTES = 32; + + // Maximum Bloom filter size; it is set to the default HDFS block size as an upper bound. + // This should be reconsidered when implementing write side logic. + public static final int MAXIMUM_BLOOM_FILTER_BYTES = 128 * 1024 * 1024; + + // The number of bits to set in a tiny Bloom filter. + private static final int BITS_SET_PER_BLOCK = 8; + + // The header of the Bloom filter; it includes number of bytes, algorithm and hash enumeration. + public static final int HEADER_SIZE = 12; + + // Hash strategy used in this Bloom filter. + public final HashStrategy hashStrategy; + + // Algorithm used in this Bloom filter. + public final Algorithm algorithm; + + // The underlying byte array for Bloom filter bitset. + private byte[] bitset; + + // An integer buffer view of the underlying bitset to help set bits. + private IntBuffer intBuffer; + + // Hash function used to compute the hash for a column value. 
+ private HashFunction hashFunction; + + // The block-based algorithm needs 8 odd SALT values to calculate eight index + // of bit to set, one bit in 32-bit word. + private static final int SALT[] = {0x47b6137b, 0x44974d91, 0x8824ad5b, 0xa2b7289d, + 0x705495c7, 0x2df1424b, 0x9efc4947, 0x5c6bfb31}; + + /** + * Constructor of Bloom filter. + * + * @param numBytes The number of bytes for Bloom filter bitset. The range of num_bytes should be within + * [MINIMUM_BLOOM_FILTER_BYTES, MAXIMUM_BLOOM_FILTER_BYTES], it will be rounded up/down + * to lower/upper bound if num_bytes is out of range and also will rounded up to a power + * of 2. It uses murmur3_x64_128 as its default hash function and block-based algorithm + * as default algorithm. + */ + public BloomFilter(int numBytes) { + this(numBytes, HashStrategy.MURMUR3_X64_128, Algorithm.BLOCK); + } + + /** + * Constructor of Bloom filter. It uses murmur3_x64_128 as its default hash + * function and block-based algorithm as its default algorithm. + * + * @param numBytes The number of bytes for Bloom filter bitset + * @param hashStrategy The hash strategy of Bloom filter. + * @param algorithm The algorithm of Bloom filter. + */ + private BloomFilter(int numBytes, HashStrategy hashStrategy, Algorithm algorithm) { + initBitset(numBytes); + + switch (hashStrategy) { + case MURMUR3_X64_128: + this.hashStrategy = hashStrategy; + hashFunction = Hashing.murmur3_128(DEFAULT_SEED); + break; + default: + throw new RuntimeException("Not supported hash strategy"); + } + + this.algorithm = algorithm; + } + + + /** + * Construct the Bloom filter with given bitset, it is used when reconstructing + * Bloom filter from parquet file. It use murmur3_x64_128 as its default hash + * function and block-based algorithm as default algorithm. + * + * @param bitset The given bitset to construct Bloom filter. 
+ */ + public BloomFilter(byte[] bitset) { + this(bitset, HashStrategy.MURMUR3_X64_128, Algorithm.BLOCK); + } + + /** + * Construct the Bloom filter with given bitset, it is used when reconstructing + * Bloom filter from parquet file. + * + * @param bitset The given bitset to construct Bloom filter. + * @param hashStrategy The hash strategy Bloom filter apply. + * @param algorithm The algorithm of Bloom filter. + */ + private BloomFilter(byte[] bitset, HashStrategy hashStrategy, Algorithm algorithm) { + if (bitset == null) { + throw new RuntimeException("Given bitset is null"); + } + this.bitset = bitset; + this.intBuffer = ByteBuffer.wrap(bitset).order(ByteOrder.LITTLE_ENDIAN).asIntBuffer(); + + switch (hashStrategy) { + case MURMUR3_X64_128: + this.hashStrategy = hashStrategy; + hashFunction = Hashing.murmur3_128(DEFAULT_SEED); + break; + default: + throw new RuntimeException("Not supported hash strategy"); + } + this.algorithm = algorithm; + } + + /** + * Create a new bitset for Bloom filter. + * + * @param numBytes The number of bytes for Bloom filter bitset. The range of num_bytes should be within + * [MINIMUM_BLOOM_FILTER_BYTES, MAXIMUM_BLOOM_FILTER_BYTES], it will be rounded up/down + * to lower/upper bound if num_bytes is out of range and also will rounded up to a power + * of 2. It uses murmur3_x64_128 as its default hash function and block-based algorithm + * as default algorithm. + */ + private void initBitset(int numBytes) { + if (numBytes < MINIMUM_BLOOM_FILTER_BYTES) { + numBytes = MINIMUM_BLOOM_FILTER_BYTES; + } + + // Get next power of 2 if it is not power of 2. + if ((numBytes & (numBytes - 1)) != 0) { + numBytes = Integer.highestOneBit(numBytes) << 1; + } + + if (numBytes > MAXIMUM_BLOOM_FILTER_BYTES || numBytes < 0) { + numBytes = MAXIMUM_BLOOM_FILTER_BYTES; + } + + this.bitset = new byte[numBytes]; + this.intBuffer = ByteBuffer.wrap(bitset).order(ByteOrder.LITTLE_ENDIAN).asIntBuffer(); + } + + /** + * Write the Bloom filter to an output stream. 
It writes the Bloom filter header, which includes the + * bitset's length in bytes, the hash strategy and the algorithm, followed by the bitset. + * + * @param out the output stream to write to + */ + public void writeTo(OutputStream out) throws IOException { + // Write number of bytes of bitset. + out.write(BytesUtils.intToBytes(bitset.length)); + + // Write hash strategy + out.write(BytesUtils.intToBytes(this.hashStrategy.ordinal())); + + // Write algorithm + out.write(BytesUtils.intToBytes(this.algorithm.ordinal())); + + // Write bitset + out.write(bitset); + } + + private int[] setMask(int key) { + int mask[] = new int[BITS_SET_PER_BLOCK]; + + for (int i = 0; i < BITS_SET_PER_BLOCK; ++i) { + mask[i] = key * SALT[i]; + } + + for (int i = 0; i < BITS_SET_PER_BLOCK; ++i) { + mask[i] = mask[i] >>> 27; + } + + for (int i = 0; i < BITS_SET_PER_BLOCK; ++i) { + mask[i] = 0x1 << mask[i]; + } + + return mask; + } + + /** + * Add an element to the Bloom filter. The element is represented by + * the hash value of its plain encoding result. + * + * @param hash the hash of the element. + */ + public void insert(long hash) { + int bucketIndex = (int)(hash >> 32) & (bitset.length / BYTES_PER_FILTER_BLOCK - 1); + int key = (int)hash; + + // Calculate mask for bucket. + int mask[] = setMask(key); + + for (int i = 0; i < BITS_SET_PER_BLOCK; i++) { + int value = intBuffer.get(bucketIndex * (BYTES_PER_FILTER_BLOCK / 4) + i); + value |= mask[i]; + intBuffer.put(bucketIndex * (BYTES_PER_FILTER_BLOCK / 4) + i, value); + } + } + + /** + * Determine whether an element is in the set or not. + * + * @param hash the hash value of the element's plain encoding result. + * @return false if the element is definitely not in the set, true if it is probably in the set. + */ + public boolean find(long hash) { + int bucketIndex = (int)(hash >> 32) & (bitset.length / BYTES_PER_FILTER_BLOCK - 1); + int key = (int)hash; + + // Calculate mask for the tiny Bloom filter. 
+ int mask[] = setMask(key); + + for (int i = 0; i < BITS_SET_PER_BLOCK; i++) { + if (0 == (intBuffer.get(bucketIndex * (BYTES_PER_FILTER_BLOCK / 4) + i) & mask[i])) { + return false; + } + } + + return true; + } + + /** + * Calculate optimal size according to the number of distinct values and false positive probability. + * + * @param n: The number of distinct values. + * @param p: The false positive probability. + * @return optimal number of bits of given n and p. + */ + public static int optimalNumOfBits(long n, double p) { + Preconditions.checkArgument((p > 0.0 && p < 1.0), + "FPP should be less than 1.0 and great than 0.0"); + + final double m = -8 * n / Math.log(1 - Math.pow(p, 1.0 / 8)); + final double MAX = MAXIMUM_BLOOM_FILTER_BYTES << 3; + int numBits = (int)m; + + // Handle overflow. + if (m > MAX || m < 0) { + numBits = (int)MAX; + } + + // Get next power of 2 if bits is not power of 2. + if ((numBits & (numBits - 1)) != 0) { + numBits = Integer.highestOneBit(numBits) << 1; + } + + if (numBits < (MINIMUM_BLOOM_FILTER_BYTES << 3)) { + numBits = MINIMUM_BLOOM_FILTER_BYTES << 3; + } + + return numBits; + } + + /** + * Compute hash for int value by using its plain encoding result. + * + * @param value the value to hash + * @return hash result + */ + public long hash(int value) { + ByteBuffer plain = ByteBuffer.allocate(Integer.SIZE/Byte.SIZE); + plain.order(ByteOrder.LITTLE_ENDIAN).putInt(value); + return hashFunction.hashBytes(plain.array()).asLong(); + } + + /** + * Compute hash for long value by using its plain encoding result. + * + * @param value the value to hash + * @return hash result + */ + public long hash(long value) { + ByteBuffer plain = ByteBuffer.allocate(Long.SIZE/Byte.SIZE); + plain.order(ByteOrder.LITTLE_ENDIAN).putLong(value); + return hashFunction.hashBytes(plain.array()).asLong(); + } + + /** + * Compute hash for double value by using its plain encoding result. 
+ * + * @param value the value to hash + * @return hash result + */ + public long hash(double value) { + ByteBuffer plain = ByteBuffer.allocate(Double.SIZE/Byte.SIZE); + plain.order(ByteOrder.LITTLE_ENDIAN).putDouble(value); + return hashFunction.hashBytes(plain.array()).asLong(); + } + + /** + * Compute hash for float value by using its plain encoding result. + * + * @param value the value to hash + * @return hash result + */ + public long hash(float value) { + ByteBuffer plain = ByteBuffer.allocate(Float.SIZE/Byte.SIZE); + plain.order(ByteOrder.LITTLE_ENDIAN).putFloat(value); + return hashFunction.hashBytes(plain.array()).asLong(); + } + + /** + * Compute hash for Binary value by using its plain encoding result. + * + * @param value the value to hash + * @return hash result + */ + public long hash(Binary value) { + return hashFunction.hashBytes(value.toByteBuffer()).asLong(); + } + + /** + * Get allocated buffer size. + * @return size in byte. + */ + public long getBufferedSize() { + return this.bitset.length; + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterReadStore.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterReadStore.java new file mode 100644 index 0000000000..bdc51755b0 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterReadStore.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.bloomfilter; + +import org.apache.parquet.column.ColumnDescriptor; + +/** + * Contains the Bloom filter readers for all columns of a row group + */ + +public interface BloomFilterReadStore { + /** + * Get a Bloom filter reader of a column + * + * @param path the descriptor of the column + * @return the corresponding Bloom filter reader + */ + BloomFilterReader getBloomFilterReader(ColumnDescriptor path); +} + + diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterReader.java new file mode 100644 index 0000000000..39b25e2a49 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterReader.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.bloomfilter; + +import org.apache.parquet.column.ColumnDescriptor; + +public interface BloomFilterReader { + /** + * Returns a {@link BloomFilter} for the given column descriptor. + * + * @param path the descriptor of the column + * @return the Bloom filter data for that column, or null if there isn't one + */ + BloomFilter readBloomFilter(ColumnDescriptor path); +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriteStore.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriteStore.java new file mode 100644 index 0000000000..f472104daa --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriteStore.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ +package org.apache.parquet.column.values.bloomfilter; + +import org.apache.parquet.column.ColumnDescriptor; + +/** + * Contains all writers for all columns of a row group + */ +public interface BloomFilterWriteStore { + /** + * Get bloom filter writer of a column + * + * @param path the descriptor for the column + * @return the corresponding Bloom filter writer + */ + BloomFilterWriter getBloomFilterWriter(ColumnDescriptor path); +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriter.java new file mode 100644 index 0000000000..388e779968 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriter.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.column.values.bloomfilter; + +public interface BloomFilterWriter { + /** + * Write a bloom filter + * + * @param bloomFilter the bloom filter to write + * + */ + void writeBloomFilter(BloomFilter bloomFilter); +} diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 1442910c8c..7f2a766a47 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -439,6 +439,7 @@ private void addRowGroup(ParquetMetadata parquetMetadata, List rowGrou columnMetaData.getTotalSize(), columnMetaData.getFirstDataPageOffset()); columnChunk.meta_data.dictionary_page_offset = columnMetaData.getDictionaryPageOffset(); + columnChunk.meta_data.setBloom_filter_offset(columnMetaData.getBloomFilterOffset()); if (!columnMetaData.getStatistics().isEmpty()) { columnChunk.meta_data.setStatistics(toParquetStatistics(columnMetaData.getStatistics())); } @@ -1112,6 +1113,7 @@ public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws messageType.getType(path.toArray()).asPrimitiveType()), metaData.data_page_offset, metaData.dictionary_page_offset, + metaData.bloom_filter_offset, metaData.num_values, metaData.total_compressed_size, metaData.total_uncompressed_size); diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/BloomFilterDataReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/BloomFilterDataReader.java new file mode 100644 index 0000000000..6b861e55c5 --- /dev/null +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/BloomFilterDataReader.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.hadoop; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.parquet.Strings; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.values.bloomfilter.BloomFilter; +import org.apache.parquet.column.values.bloomfilter.BloomFilterReader; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.io.ParquetDecodingException; + +/** + * A {@link BloomFilterReader} implementation that reads Bloom filter data from + * an open {@link ParquetFileReader}. 
+ * + */ + +public class BloomFilterDataReader implements BloomFilterReader { + private final ParquetFileReader reader; + private final Map columns; + private final Map cache = new HashMap<>(); + + public BloomFilterDataReader(ParquetFileReader fileReader, BlockMetaData block) { + this.reader = fileReader; + this.columns = new HashMap<>(); + for (ColumnChunkMetaData column : block.getColumns()) { + columns.put(column.getPath().toDotString(), column); + } + } + + @Override + public BloomFilter readBloomFilter(ColumnDescriptor descriptor) { + String dotPath = Strings.join(descriptor.getPath(), "."); + ColumnChunkMetaData column = columns.get(dotPath); + if (column == null) { + throw new ParquetDecodingException( + "Cannot load Bloom filter data, unknown column: " + dotPath); + } + + if (cache.containsKey(dotPath)) { // NOTE(review): lock-free read of a plain HashMap that is written under synchronized (cache) below - confirm this is safe for concurrent callers + return cache.get(dotPath); + } + + try { + synchronized (cache) { + if (!cache.containsKey(dotPath)) { + BloomFilter bloomFilter = reader.readBloomFilter(column); + if (bloomFilter == null) return null; // a missing filter is not cached, so the next call reads from the file again + cache.put(dotPath, bloomFilter); + } + } + + return cache.get(dotPath); + } catch (IOException e) { + throw new ParquetDecodingException( + "Failed to read Bloom data", e); + } + } +} diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java index 82c288fe43..58b1450dfb 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java @@ -28,6 +28,7 @@ import java.util.Set; import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.bytes.ByteBufferAllocator; import org.apache.parquet.bytes.ConcatenatingByteArrayCollector; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.Encoding; @@ -35,20 +36,22 @@ import org.apache.parquet.column.page.PageWriteStore;
import org.apache.parquet.column.page.PageWriter; import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.bloomfilter.BloomFilter; +import org.apache.parquet.column.values.bloomfilter.BloomFilterWriteStore; +import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter; import org.apache.parquet.format.converter.ParquetMetadataConverter; import org.apache.parquet.hadoop.CodecFactory.BytesCompressor; import org.apache.parquet.io.ParquetEncodingException; import org.apache.parquet.schema.MessageType; -import org.apache.parquet.bytes.ByteBufferAllocator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -class ColumnChunkPageWriteStore implements PageWriteStore { +class ColumnChunkPageWriteStore implements PageWriteStore, BloomFilterWriteStore { private static final Logger LOG = LoggerFactory.getLogger(ColumnChunkPageWriteStore.class); private static ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter(); - private static final class ColumnChunkPageWriter implements PageWriter { + private static final class ColumnChunkPageWriter implements PageWriter, BloomFilterWriter { private final ColumnDescriptor path; private final BytesCompressor compressor; @@ -56,6 +59,7 @@ private static final class ColumnChunkPageWriter implements PageWriter { private final ByteArrayOutputStream tempOutputStream = new ByteArrayOutputStream(); private final ConcatenatingByteArrayCollector buf; private DictionaryPage dictionaryPage; + private BloomFilter bloomFilter; private long uncompressedLength; private long compressedLength; @@ -194,6 +198,10 @@ public long getMemSize() { public void writeToFileWriter(ParquetFileWriter writer) throws IOException { writer.startColumn(path, totalValueCount, compressor.getCodecName()); + if (bloomFilter != null) { + writer.writeBloomFilter(bloomFilter); + } + if (dictionaryPage != null) { writer.writeDictionaryPage(dictionaryPage); // tracking the dictionary encoding is 
handled in writeDictionaryPage @@ -238,6 +246,11 @@ public String memUsageString(String prefix) { return buf.memUsageString(prefix + " ColumnChunkPageWriter"); } + @Override + public void writeBloomFilter(BloomFilter bloomFilter) { + this.bloomFilter = bloomFilter; + } + } private final Map writers = new HashMap(); @@ -255,6 +268,11 @@ public PageWriter getPageWriter(ColumnDescriptor path) { return writers.get(path); } + @Override + public BloomFilterWriter getBloomFilterWriter(ColumnDescriptor path) { + return writers.get(path); + } + public void flushToFileWriter(ParquetFileWriter writer) throws IOException { for (ColumnDescriptor path : schema.getColumns()) { ColumnChunkPageWriter pageWriter = writers.get(path); diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java index 15fe592dbe..a7d07ba007 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java @@ -31,6 +31,8 @@ import java.io.IOException; import java.io.SequenceInputStream; import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.IntBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -54,19 +56,20 @@ import org.apache.parquet.ParquetReadOptions; import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.Encoding; import org.apache.parquet.column.page.DictionaryPageReadStore; import org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor; import org.apache.parquet.filter2.compat.FilterCompat; import org.apache.parquet.filter2.compat.RowGroupFilter; -import org.apache.parquet.bytes.BytesInput; -import org.apache.parquet.column.ColumnDescriptor; import 
org.apache.parquet.column.page.DataPage; import org.apache.parquet.column.page.DataPageV1; import org.apache.parquet.column.page.DataPageV2; import org.apache.parquet.column.page.DictionaryPage; import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.column.values.bloomfilter.BloomFilter; import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.format.DataPageHeader; import org.apache.parquet.format.DataPageHeaderV2; @@ -83,10 +86,10 @@ import org.apache.parquet.hadoop.util.HadoopInputFile; import org.apache.parquet.HadoopReadOptions; import org.apache.parquet.hadoop.util.HiddenFileFilter; -import org.apache.parquet.io.SeekableInputStream; import org.apache.parquet.hadoop.util.counters.BenchmarkCounter; import org.apache.parquet.io.ParquetDecodingException; import org.apache.parquet.io.InputFile; +import org.apache.parquet.io.SeekableInputStream; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.PrimitiveType; import org.slf4j.Logger; @@ -859,6 +862,9 @@ public DictionaryPageReader getDictionaryReader(BlockMetaData block) { return new DictionaryPageReader(this, block); } + public BloomFilterDataReader getBloomFilterDataReader(BlockMetaData block) { + return new BloomFilterDataReader(this, block); + } /** * Reads and decompresses a dictionary page for the given column chunk. * @@ -910,6 +916,35 @@ private DictionaryPage readCompressedDictionary( converter.getEncoding(dictHeader.getEncoding())); } + /** + * Reads Bloom filter data for the given column chunk. + * + * @param meta a column's ColumnChunkMetaData to read the Bloom filter from + * @return a BloomFilter object, or null if the chunk's Bloom filter offset is unset (Long.MAX_VALUE). + * @throws IOException if there is an error while reading the Bloom filter.
+ */ + public BloomFilter readBloomFilter(ColumnChunkMetaData meta) throws IOException { + long bloomFilterOffset = meta.getBloomFilterOffset(); + + if (bloomFilterOffset == Long.MAX_VALUE) return null; + f.seek(bloomFilterOffset); + + // Read Bloom filter data header; readFully so a short read cannot leave the header partially filled. + byte[] bytes = new byte[BloomFilter.HEADER_SIZE]; + f.readFully(bytes); + ByteBuffer bloomHeader = ByteBuffer.wrap(bytes); + IntBuffer headerBuffer = bloomHeader.order(ByteOrder.LITTLE_ENDIAN).asIntBuffer(); + int numBytes = headerBuffer.get(); + + BloomFilter.HashStrategy hash = BloomFilter.HashStrategy.values()[headerBuffer.get()]; + BloomFilter.Algorithm algorithm = BloomFilter.Algorithm.values()[headerBuffer.get()]; + + byte[] bitset = new byte[numBytes]; + f.readFully(bitset); + + return new BloomFilter(bitset); + } + @Override public void close() throws IOException { try { diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java index c98c247965..7c52b1b93f 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java @@ -48,6 +48,7 @@ import org.apache.parquet.column.EncodingStats; import org.apache.parquet.column.page.DictionaryPage; import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.bloomfilter.BloomFilter; import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel; import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.format.converter.ParquetMetadataConverter; @@ -117,6 +118,7 @@ public static enum Mode { private long currentChunkValueCount; // set in startColumn private long currentChunkFirstDataPage; // set in startColumn (out.pos()) private long currentChunkDictionaryPageOffset; // set in writeDictionaryPage + private long currentChunkBloomFilterDataOffset; // set in writeBloomFilter //
set when end is called private ParquetMetadata footer = null; @@ -348,6 +350,16 @@ public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOExceptio currentEncodings.add(dictionaryPage.getEncoding()); } + /** + * Write a Bloom filter + * @param bloomFilter the bloom filter of column values + * @throws IOException if there is an error while writing + */ + public void writeBloomFilter(BloomFilter bloomFilter) throws IOException { + state = state.write(); + currentChunkBloomFilterDataOffset = out.getPos(); + bloomFilter.writeTo(out); + } /** * writes a single page @@ -484,6 +496,7 @@ public void endColumn() throws IOException { currentStatistics, currentChunkFirstDataPage, currentChunkDictionaryPageOffset, + currentChunkBloomFilterDataOffset, currentChunkValueCount, compressedLength, uncompressedLength)); @@ -622,6 +635,7 @@ public void appendRowGroup(SeekableInputStream from, BlockMetaData rowGroup, chunk.getStatistics(), newChunkStart, newChunkStart, + chunk.getBloomFilterOffset(), chunk.getValueCount(), chunk.getTotalSize(), chunk.getTotalUncompressedSize())); diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetInputFormat.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetInputFormat.java index 2c21e52035..3348ed8eb2 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetInputFormat.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetInputFormat.java @@ -129,6 +129,12 @@ public class ParquetInputFormat extends FileInputFormat { */ public static final String DICTIONARY_FILTERING_ENABLED = "parquet.filter.dictionary.enabled"; + /** + * key to configure whether row group bloom filtering is enabled + */ + public static final String BLOOM_FILTERING_ENABLED = "parquet.filter.bloom.enabled"; + public static final boolean BLOOM_FILTER_ENABLED_DEFAULT = false; + /** * key to turn on or off task side metadata loading (default true) * if true then metadata is read on the task 
side and some tasks may finish immediately. diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java index ff5bab397d..6e191b005c 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java @@ -143,6 +143,9 @@ public static enum JobSummaryLevel { public static final String MIN_ROW_COUNT_FOR_PAGE_SIZE_CHECK = "parquet.page.size.row.check.min"; public static final String MAX_ROW_COUNT_FOR_PAGE_SIZE_CHECK = "parquet.page.size.row.check.max"; public static final String ESTIMATE_PAGE_SIZE_CHECK = "parquet.page.size.check.estimate"; + public static final String BLOOM_FILTER_COLUMN_NAMES = "parquet.bloom.filter.column.names"; + public static final String BLOOM_FILTER_SIZES = "parquet.bloom.filter.size"; + public static final String ENABLE_BLOOM_FILTER = "parquet.enable.bloom.filter"; public static JobSummaryLevel getJobSummaryLevel(Configuration conf) { String level = conf.get(JOB_SUMMARY_LEVEL); @@ -208,6 +211,14 @@ public static boolean getEnableDictionary(JobContext jobContext) { return getEnableDictionary(getConfiguration(jobContext)); } + public static void setBloomFilterColumnNames(Job job, String names) { + getConfiguration(job).set(BLOOM_FILTER_COLUMN_NAMES, names); + } + + public static String getBloomFilterColumnNames(JobContext jobContext) { + return getBloomFilterColumnNames(getConfiguration(jobContext)); + } + public static int getBlockSize(JobContext jobContext) { return getBlockSize(getConfiguration(jobContext)); } @@ -241,6 +252,19 @@ public static boolean getEnableDictionary(Configuration configuration) { ENABLE_DICTIONARY, ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED); } + public static String getBloomFilterColumnNames(Configuration conf) { + return conf.get(BLOOM_FILTER_COLUMN_NAMES); + } + + public static boolean 
getEnableBloomFilter(Configuration configuration) { + return configuration.getBoolean(ENABLE_BLOOM_FILTER, + ParquetProperties.DEFAULT_BLOOM_FILTER_ENABLED); + } + + public static String getBloomFilterSizes(Configuration configuration) { + return configuration.get(BLOOM_FILTER_SIZES); + } + public static int getMinRowCountForPageSizeCheck(Configuration configuration) { return configuration.getInt(MIN_ROW_COUNT_FOR_PAGE_SIZE_CHECK, ParquetProperties.DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK); @@ -361,6 +385,8 @@ public RecordWriter getRecordWriter(Configuration conf, Path file, Comp ParquetProperties props = ParquetProperties.builder() .withPageSize(getPageSize(conf)) .withDictionaryPageSize(getDictionaryPageSize(conf)) + .withBloomFilterEnabled(getEnableBloomFilter(conf)) + .withBloomFilterInfo(getBloomFilterColumnNames(conf), getBloomFilterSizes(conf)) .withDictionaryEncoding(getEnableDictionary(conf)) .withWriterVersion(getWriterVersion(conf)) .estimateRowCountForPageSizeCheck(getEstimatePageSizeCheck(conf)) @@ -383,6 +409,9 @@ public RecordWriter getRecordWriter(Configuration conf, Path file, Comp LOG.info("Page size checking is: {}", (props.estimateNextSizeCheck() ? "estimated" : "constant")); LOG.info("Min row count for page size check is: {}", props.getMinRowCountForPageSizeCheck()); LOG.info("Max row count for page size check is: {}", props.getMaxRowCountForPageSizeCheck()); + LOG.info("Parquet Bloom Filter is {}", props.isBloomFilterEnabled()? 
"on": "off"); + LOG.info("Parquet Bloom filter column names are: {}", props.getBloomFilterInfo().keySet()); + LOG.info("Parquet Bloom filter column sizes are: {}", props.getBloomFilterInfo().values()); } WriteContext init = writeSupport.init(conf); diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java index fb94247ed7..9f476f6d07 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java @@ -124,6 +124,7 @@ && positiveLongFitsInAnInt(totalUncompressedSize)) { statistics, firstDataPage, dictionaryPageOffset, + Long.MAX_VALUE, valueCount, totalSize, totalUncompressedSize); @@ -134,6 +135,50 @@ && positiveLongFitsInAnInt(totalUncompressedSize)) { statistics, firstDataPage, dictionaryPageOffset, + Long.MAX_VALUE, + valueCount, + totalSize, + totalUncompressedSize); + } + } + + public static ColumnChunkMetaData get( + ColumnPath path, + PrimitiveType type, + CompressionCodecName codec, + EncodingStats encodingStats, + Set encodings, + Statistics statistics, + long firstDataPage, + long dictionaryPageOffset, + long bloomFilterDataOffset, + long valueCount, + long totalSize, + long totalUncompressedSize) { + // to save space we store those always positive longs in ints when they fit. 
+ if (positiveLongFitsInAnInt(firstDataPage) + && positiveLongFitsInAnInt(dictionaryPageOffset) // NOTE(review): bloomFilterDataOffset is not part of this check, yet IntColumnChunkMetaData passes it to positiveLongToInt - confirm an unset offset (Long.MAX_VALUE) is accepted there + && positiveLongFitsInAnInt(valueCount) + && positiveLongFitsInAnInt(totalSize) + && positiveLongFitsInAnInt(totalUncompressedSize)) { + return new IntColumnChunkMetaData( + path, type, codec, + encodingStats, encodings, + statistics, + firstDataPage, + dictionaryPageOffset, + bloomFilterDataOffset, + valueCount, + totalSize, + totalUncompressedSize); + } else { + return new LongColumnChunkMetaData( + path, type, codec, + encodingStats, encodings, + statistics, + firstDataPage, + dictionaryPageOffset, + bloomFilterDataOffset, valueCount, totalSize, totalUncompressedSize); @@ -217,6 +262,11 @@ public PrimitiveType getPrimitiveType() { */ abstract public long getDictionaryPageOffset(); + /** + * @return the location of the Bloom filter data if any + */ + abstract public long getBloomFilterOffset(); + /** * @return count of values in this block of the column */ @@ -258,6 +308,7 @@ class IntColumnChunkMetaData extends ColumnChunkMetaData { private final int firstDataPage; private final int dictionaryPageOffset; + private final int bloomFilterDataOffset; private final int valueCount; private final int totalSize; private final int totalUncompressedSize; @@ -284,12 +335,14 @@ class IntColumnChunkMetaData extends ColumnChunkMetaData { Statistics statistics, long firstDataPage, long dictionaryPageOffset, + long bloomFilterDataOffset, long valueCount, long totalSize, long totalUncompressedSize) { super(encodingStats, ColumnChunkProperties.get(path, type, codec, encodings)); this.firstDataPage = positiveLongToInt(firstDataPage); this.dictionaryPageOffset = positiveLongToInt(dictionaryPageOffset); + this.bloomFilterDataOffset = positiveLongToInt(bloomFilterDataOffset); this.valueCount = positiveLongToInt(valueCount); this.totalSize = positiveLongToInt(totalSize); this.totalUncompressedSize = positiveLongToInt(totalUncompressedSize); @@ -331,6 +384,13 @@ public long
getDictionaryPageOffset() { return intToPositiveLong(dictionaryPageOffset); } + /** + * @return the location of bloom filter if any + */ + public long getBloomFilterOffset() { + return intToPositiveLong(bloomFilterDataOffset); + } + /** * @return count of values in this block of the column */ @@ -363,6 +423,7 @@ class LongColumnChunkMetaData extends ColumnChunkMetaData { private final long firstDataPageOffset; private final long dictionaryPageOffset; + private final long bloomFilterDataOffset; private final long valueCount; private final long totalSize; private final long totalUncompressedSize; @@ -389,12 +450,14 @@ class LongColumnChunkMetaData extends ColumnChunkMetaData { Statistics statistics, long firstDataPageOffset, long dictionaryPageOffset, + long bloomFilterDataOffset, long valueCount, long totalSize, long totalUncompressedSize) { super(encodingStats, ColumnChunkProperties.get(path, type, codec, encodings)); this.firstDataPageOffset = firstDataPageOffset; this.dictionaryPageOffset = dictionaryPageOffset; + this.bloomFilterDataOffset = bloomFilterDataOffset; this.valueCount = valueCount; this.totalSize = totalSize; this.totalUncompressedSize = totalUncompressedSize; @@ -415,6 +478,13 @@ public long getDictionaryPageOffset() { return dictionaryPageOffset; } + /** + * @return the location of the bloom filter if any + */ + public long getBloomFilterOffset() { + return bloomFilterDataOffset; + } + /** * @return count of values in this block of the column */ diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java index 095b575c80..636515dc8f 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java @@ -39,6 +39,7 @@ import org.apache.parquet.column.page.PageReader; import 
org.apache.parquet.column.statistics.BinaryStatistics; import org.apache.parquet.column.statistics.LongStatistics; +import org.apache.parquet.column.values.bloomfilter.*; import org.apache.parquet.format.Statistics; import org.apache.parquet.hadoop.metadata.*; import org.apache.parquet.hadoop.util.HiddenFileFilter; @@ -132,6 +133,44 @@ public void testWriteMode() throws Exception { testFile.delete(); } + @Test + public void testBloomWriteRead() throws Exception { + MessageType schema = MessageTypeParser.parseMessageType("message test { required binary foo; }"); + File testFile = temp.newFile(); + testFile.delete(); + + Path path = new Path(testFile.toURI()); + Configuration configuration = new Configuration(); + configuration.set("parquet.bloomFilter.filter.column.names", "foo"); + String colPath[] = {"foo"}; + ColumnDescriptor col = schema.getColumnDescription(colPath); + + BinaryStatistics stats1 = new BinaryStatistics(); + + ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path); + w.start(); + w.startBlock(3); + w.startColumn(col, 5, CODEC); + w.writeDataPage(2, 4, BytesInput.from(BYTES1),stats1, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(3, 4, BytesInput.from(BYTES1),stats1, BIT_PACKED, BIT_PACKED, PLAIN); + BloomFilter bloomData = new BloomFilter(0); + bloomData.insert(bloomData.hash(Binary.fromString("hello"))); + bloomData.insert(bloomData.hash(Binary.fromString("world"))); + long blStarts = w.getPos(); + w.writeBloomFilter(bloomData); + w.endColumn(); + w.endBlock(); + w.end(new HashMap()); + ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path); + assertEquals("bloomFilter offset", blStarts, readFooter.getBlocks().get(0).getColumns().get(0).getBloomFilterOffset()); + ParquetFileReader r = new ParquetFileReader(configuration, readFooter.getFileMetaData(), path, + Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(schema.getColumnDescription(colPath))); + BloomFilterReader bloomFilterReader = 
r.getBloomFilterDataReader(readFooter.getBlocks().get(0)); + BloomFilter bloomDataRead = bloomFilterReader.readBloomFilter(col); + assertTrue(bloomDataRead.find(bloomData.hash(Binary.fromString("hello")))); + assertTrue(bloomDataRead.find(bloomData.hash(Binary.fromString("world")))); + } + @Test public void testWriteRead() throws Exception { File testFile = temp.newFile(); diff --git a/pom.xml b/pom.xml index 7b3f36fe5b..ee8ae94fb7 100644 --- a/pom.xml +++ b/pom.xml @@ -96,7 +96,7 @@ 0.9.33 1.7.22 1.8.2 - 20.0 + 24.0-jre 0.1.1 1.10.19 From 1a0875beddd46d2d226fd1e9b4e1f356a1f5212a Mon Sep 17 00:00:00 2001 From: Junjie Chen Date: Sun, 21 Oct 2018 01:04:41 +0800 Subject: [PATCH 2/9] Align to parquet-cpp side code and address comments --- .../apache/parquet/cli/util/Expressions.java | 4 +- .../parquet/column/ParquetProperties.java | 38 +-- .../column/impl/ColumnWriteStoreV1.java | 8 +- .../column/impl/ColumnWriteStoreV2.java | 8 +- .../parquet/column/impl/ColumnWriterV1.java | 80 +++-- .../parquet/column/impl/ColumnWriterV2.java | 32 +- .../bloomfilter/BlockSplitBloomFilter.java | 318 ++++++++++++++++++ .../values/bloomfilter/BloomFilter.java | 296 +--------------- .../TestBlockSplitBloomFilter.java | 129 +++++++ .../hadoop/ColumnChunkPageWriteStore.java | 1 + .../parquet/hadoop/ParquetFileReader.java | 5 +- .../parquet/hadoop/ParquetOutputFormat.java | 18 +- .../parquet/hadoop/TestParquetFileWriter.java | 8 +- pom.xml | 4 +- 14 files changed, 583 insertions(+), 366 deletions(-) create mode 100644 parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java create mode 100644 parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/util/Expressions.java b/parquet-cli/src/main/java/org/apache/parquet/cli/util/Expressions.java index d18ef559f2..06b28b46ae 100644 --- 
a/parquet-cli/src/main/java/org/apache/parquet/cli/util/Expressions.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/util/Expressions.java @@ -19,7 +19,7 @@ package org.apache.parquet.cli.util; -import com.google.common.base.MoreObjects; +import com.google.common.base.Objects; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import org.apache.avro.Schema; @@ -385,7 +385,7 @@ public int hashCode() { @Override public String toString() { - return MoreObjects.toStringHelper(this) + return Objects.toStringHelper(this) .add("type", type) .add("value", value) .add("children", children) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java b/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java index 94f1978f68..f01888aed8 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -87,11 +87,11 @@ public static WriterVersion fromString(String name) { private final ByteBufferAllocator allocator; private final ValuesWriterFactory valuesWriterFactory; private final boolean enableBloomFilter; - private final HashMap bloomFilterInfo; + private final HashMap bloomFilterExpectValues; private ParquetProperties(WriterVersion writerVersion, int pageSize, int dictPageSize, boolean enableDict, int minRowCountForPageSizeCheck, int maxRowCountForPageSizeCheck, boolean estimateNextSizeCheck, ByteBufferAllocator allocator, - ValuesWriterFactory writerFactory, boolean enableBloomFilter, HashMap bloomFilterInfo) { + ValuesWriterFactory writerFactory, boolean enableBloomFilter, HashMap bloomFilterExpectValues) { this.pageSizeThreshold = pageSize; this.initialSlabSize = CapacityByteArrayOutputStream .initialSlabSizeHeuristic(MIN_SLAB_SIZE, pageSizeThreshold, 10); @@ -103,7 +103,7 @@ private ParquetProperties(WriterVersion writerVersion, int pageSize, int dictPag this.estimateNextSizeCheck = estimateNextSizeCheck; this.allocator = allocator; this.enableBloomFilter = enableBloomFilter; - this.bloomFilterInfo = bloomFilterInfo; + this.bloomFilterExpectValues = bloomFilterExpectValues; this.valuesWriterFactory = writerFactory; } @@ -169,8 +169,8 @@ public boolean isBloomFilterEnabled() { return enableBloomFilter; } - public HashMap getBloomFilterInfo() { - return bloomFilterInfo; + public HashMap getBloomFilterExpectValues() { + return bloomFilterExpectValues; } public ColumnWriteStore newColumnWriteStore(MessageType schema, @@ -214,7 +214,7 @@ public static class Builder { private int dictPageSize = DEFAULT_DICTIONARY_PAGE_SIZE; private boolean enableDict = 
DEFAULT_IS_DICTIONARY_ENABLED; private boolean enableBloomFilter = DEFAULT_BLOOM_FILTER_ENABLED; - private HashMap bloomFilterInfo = new HashMap<>(); + private HashMap bloomFilterExpectValues = new HashMap<>(); private WriterVersion writerVersion = DEFAULT_WRITER_VERSION; private int minRowCountForPageSizeCheck = DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK; private int maxRowCountForPageSizeCheck = DEFAULT_MAXIMUM_RECORD_COUNT_FOR_CHECK; @@ -234,7 +234,7 @@ private Builder(ParquetProperties toCopy) { this.estimateNextSizeCheck = toCopy.estimateNextSizeCheck; this.allocator = toCopy.allocator; this.enableBloomFilter = toCopy.enableBloomFilter; - this.bloomFilterInfo = toCopy.bloomFilterInfo; + this.bloomFilterExpectValues = toCopy.bloomFilterExpectValues; } /** @@ -288,19 +288,19 @@ public Builder withBloomFilterEnabled(boolean enableBloomFilter) { /** * Set Bloom filter info for columns. * - * @param names the columns to be enable for Bloom filter - * @param sizes the sizes corresponding to columns + * @param bloomFilterColumnNames the columns to be enabled for Bloom filter + * @param bloomFilterDistinctNumbers the expected distinct number of values corresponding to columns * @return this builder for method chaining */ - public Builder withBloomFilterInfo(String names, String sizes) { - String[] bloomFilterColumns = names.split(","); - String[] bloomFilterSizes = sizes.split(","); + public Builder withBloomFilterInfo(String bloomFilterColumnNames, String bloomFilterDistinctNumbers) { + String[] columnNames = bloomFilterColumnNames.split(","); + String[] expectedDistinctNumber = bloomFilterDistinctNumbers.split(","); - Preconditions.checkArgument(bloomFilterColumns.length == bloomFilterSizes.length, + Preconditions.checkArgument(columnNames.length == expectedDistinctNumber.length, "Column names are not matched to sizes"); - for (int i = 0; i < bloomFilterColumns.length; i++) { - bloomFilterInfo.put(bloomFilterColumns[i], Long.getLong(bloomFilterSizes[i])); + for (int i = 
0; i < columnNames.length; i++) { + this.bloomFilterExpectValues.put(columnNames[i], Long.getLong(expectedDistinctNumber[i])); } return this; @@ -354,7 +354,7 @@ public ParquetProperties build() { new ParquetProperties(writerVersion, pageSize, dictPageSize, enableDict, minRowCountForPageSizeCheck, maxRowCountForPageSizeCheck, estimateNextSizeCheck, allocator, valuesWriterFactory, - enableBloomFilter, bloomFilterInfo); + enableBloomFilter, bloomFilterExpectValues); // we pass a constructed but uninitialized factory to ParquetProperties above as currently // creation of ValuesWriters is invoked from within ParquetProperties. In the future // we'd like to decouple that and won't need to pass an object to properties and then pass the diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV1.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV1.java index bd401430ad..7e2876077a 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV1.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV1.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -70,7 +70,7 @@ public Set getColumnDescriptors() { private ColumnWriterV1 newMemColumn(ColumnDescriptor path) { PageWriter pageWriter = pageWriteStore.getPageWriter(path); - if (props.isBloomFilterEnabled() && props.getBloomFilterInfo() != null) { + if (props.isBloomFilterEnabled() && props.getBloomFilterExpectValues() != null) { BloomFilterWriter bloomFilterWriter = bloomFilterWriteStore.getBloomFilterWriter(path); return new ColumnWriterV1(path, pageWriter, bloomFilterWriter, props); } else { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV2.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV2.java index 057660ff5d..6c20b8bb87 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV2.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV2.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -79,7 +79,7 @@ public ColumnWriteStoreV2( for (ColumnDescriptor path : schema.getColumns()) { PageWriter pageWriter = pageWriteStore.getPageWriter(path); - if (props.isBloomFilterEnabled() && props.getBloomFilterInfo() != null) { + if (props.isBloomFilterEnabled() && props.getBloomFilterExpectValues() != null) { BloomFilterWriter bloomFilterWriter = bloomFilterWriteStore.getBloomFilterWriter(path); mcolumns.put(path, new ColumnWriterV2(path, pageWriter, bloomFilterWriter, props)); } else { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java index daf51cfa0d..c5fc9dc549 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -30,6 +30,7 @@ import org.apache.parquet.column.page.PageWriter; import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter; import org.apache.parquet.column.values.bloomfilter.BloomFilter; import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter; import org.apache.parquet.io.ParquetEncodingException; @@ -66,14 +67,19 @@ public ColumnWriterV1(ColumnDescriptor path, PageWriter pageWriter, this(path, pageWriter, props); // Current not support nested column. - if (path.getPath().length == 1) { - this.bloomFilterWriter = bloomFilterWriter; - HashMap bloomFilterInfo = props.getBloomFilterInfo(); - String column = path.getPath()[0]; - if (bloomFilterInfo.keySet().contains(column)) { - this.bloomFilter = new BloomFilter(bloomFilterInfo.get(column).intValue()); - } + if (path.getPath().length != 1 || bloomFilterWriter == null) { + return; + } + + this.bloomFilterWriter = bloomFilterWriter; + HashMap bloomFilterExpectValues = props.getBloomFilterExpectValues(); + String column = path.getPath()[0]; + if (bloomFilterExpectValues.keySet().contains(column)) { + int optimalNumOfBits = BlockSplitBloomFilter.optimalNumOfBits(bloomFilterExpectValues.get(column).intValue(), + BlockSplitBloomFilter.DEFAULT_FPP); + this.bloomFilter = new BlockSplitBloomFilter(optimalNumOfBits/8); } + } public ColumnWriterV1(ColumnDescriptor path, PageWriter pageWriter, @@ -161,6 +167,36 @@ private void updateStatistics(boolean value) { statistics.updateStats(value); } + private void updateBloomFilter(int value) { + if (bloomFilter != null) { + 
bloomFilter.insert(bloomFilter.hash(value)); + } + } + + private void updateBloomFilter(long value) { + if (bloomFilter != null) { + bloomFilter.insert(bloomFilter.hash(value)); + } + } + + private void updateBloomFilter(double value) { + if (bloomFilter != null) { + bloomFilter.insert(bloomFilter.hash(value)); + } + } + + private void updateBloomFilter(float value) { + if (bloomFilter != null) { + bloomFilter.insert(bloomFilter.hash(value)); + } + } + + private void updateBloomFilter(Binary value) { + if (bloomFilter != null) { + bloomFilter.insert(bloomFilter.hash(value)); + } + } + private void writePage() { if (DEBUG) LOG.debug("write page"); try { @@ -197,9 +233,7 @@ public void write(double value, int repetitionLevel, int definitionLevel) { definitionLevelColumn.writeInteger(definitionLevel); dataColumn.writeDouble(value); updateStatistics(value); - if (bloomFilter != null) { - bloomFilter.insert(bloomFilter.hash(value)); - } + updateBloomFilter(value); accountForValueWritten(); } @@ -210,9 +244,7 @@ public void write(float value, int repetitionLevel, int definitionLevel) { definitionLevelColumn.writeInteger(definitionLevel); dataColumn.writeFloat(value); updateStatistics(value); - if (bloomFilter != null) { - bloomFilter.insert(bloomFilter.hash(value)); - } + updateBloomFilter(value); accountForValueWritten(); } @@ -223,9 +255,7 @@ public void write(Binary value, int repetitionLevel, int definitionLevel) { definitionLevelColumn.writeInteger(definitionLevel); dataColumn.writeBytes(value); updateStatistics(value); - if (bloomFilter != null) { - bloomFilter.insert(bloomFilter.hash(value)); - } + updateBloomFilter(value); accountForValueWritten(); } @@ -246,9 +276,7 @@ public void write(int value, int repetitionLevel, int definitionLevel) { definitionLevelColumn.writeInteger(definitionLevel); dataColumn.writeInteger(value); updateStatistics(value); - if (bloomFilter != null) { - bloomFilter.insert(bloomFilter.hash(value)); - } + updateBloomFilter(value); 
accountForValueWritten(); } @@ -259,9 +287,7 @@ public void write(long value, int repetitionLevel, int definitionLevel) { definitionLevelColumn.writeInteger(definitionLevel); dataColumn.writeLong(value); updateStatistics(value); - if (bloomFilter != null) { - bloomFilter.insert(bloomFilter.hash(value)); - } + updateBloomFilter(value); accountForValueWritten(); } @@ -296,7 +322,7 @@ public void close() { @Override public long getBufferedSizeInMemory() { - long bloomBufferSize = bloomFilter == null ? 0 : bloomFilter.getBufferedSize(); + long bloomBufferSize = bloomFilter == null ? 0 : bloomFilter.getBitsetSize(); return repetitionLevelColumn.getBufferedSize() + definitionLevelColumn.getBufferedSize() + dataColumn.getBufferedSize() @@ -305,7 +331,7 @@ public long getBufferedSizeInMemory() { } public long allocatedSize() { - long bloomAllocatedSize = bloomFilter == null ? 0 : bloomFilter.getBufferedSize(); + long bloomAllocatedSize = bloomFilter == null ? 0 : bloomFilter.getBitsetSize(); return repetitionLevelColumn.getAllocatedSize() + definitionLevelColumn.getAllocatedSize() + dataColumn.getAllocatedSize() diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java index e041e97b57..7b1671407a 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -31,6 +31,7 @@ import org.apache.parquet.column.page.PageWriter; import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter; import org.apache.parquet.column.values.bloomfilter.BloomFilter; import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter; import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder; @@ -82,15 +83,18 @@ public ColumnWriterV2( ParquetProperties props) { this(path, pageWriter, props); - this.bloomFilterWriter = bloomFilterWriter; - HashMap bloomFilterInfo = props.getBloomFilterInfo(); - // Current not support nested column. - if (path.getPath().length == 1) { - String column = path.getPath()[0]; - if (bloomFilterInfo.keySet().contains(column)) { - this.bloomFilter = new BloomFilter(bloomFilterInfo.get(column).intValue()); - } + if (path.getPath().length != 1 || bloomFilterWriter == null) { + return; + } + + this.bloomFilterWriter = bloomFilterWriter; + HashMap bloomFilterExpectValues = props.getBloomFilterExpectValues(); + String column = path.getPath()[0]; + if (bloomFilterExpectValues.keySet().contains(column)) { + int optimalNumOfBits = BlockSplitBloomFilter.optimalNumOfBits(bloomFilterExpectValues.get(column).intValue(), + BlockSplitBloomFilter.DEFAULT_FPP); + this.bloomFilter = new BlockSplitBloomFilter(optimalNumOfBits/8); } } @@ -278,7 +282,7 @@ public void finalizeColumnChunk() { * @return the number of bytes of memory used to buffer the current data */ public long getCurrentPageBufferedSize() { - long bloomBufferSize = bloomFilter == null ? 
0 : bloomFilter.getBufferedSize(); + long bloomBufferSize = bloomFilter == null ? 0 : bloomFilter.getBitsetSize(); return repetitionLevelColumn.getBufferedSize() + definitionLevelColumn.getBufferedSize() + dataColumn.getBufferedSize() @@ -290,7 +294,7 @@ public long getCurrentPageBufferedSize() { * @return the number of bytes of memory used to buffer the current data and the previously written pages */ public long getTotalBufferedSize() { - long bloomBufferSize = bloomFilter == null ? 0 : bloomFilter.getBufferedSize(); + long bloomBufferSize = bloomFilter == null ? 0 : bloomFilter.getBitsetSize(); return repetitionLevelColumn.getBufferedSize() + definitionLevelColumn.getBufferedSize() + dataColumn.getBufferedSize() @@ -302,7 +306,7 @@ public long getTotalBufferedSize() { * @return actual memory used */ public long allocatedSize() { - long bloomFilterSize = bloomFilter == null ? 0 : bloomFilter.getBufferedSize(); + long bloomFilterSize = bloomFilter == null ? 0 : bloomFilter.getBitsetSize(); return repetitionLevelColumn.getAllocatedSize() + definitionLevelColumn.getAllocatedSize() + dataColumn.getAllocatedSize() diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java new file mode 100644 index 0000000000..d2cf4d692c --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java @@ -0,0 +1,318 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.bloomfilter; + +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; +import org.apache.parquet.Preconditions; +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.io.api.Binary; + +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.IntBuffer; + +/* + * This Bloom filter is implemented using block-based Bloom filter algorithm from Putze et al.'s + * "Cache-, Hash- and Space-Efficient Bloom filters". The basic idea is to hash the item to a tiny + * Bloom filter which size fit a single cache line or smaller. This implementation sets 8 bits in + * each tiny Bloom filter. Each tiny Bloom filter is 32 bytes to take advantage of 32-byte SIMD + * instruction. + */ +public class BlockSplitBloomFilter extends BloomFilter { + // Bytes in a tiny Bloom filter block. + private static final int BYTES_PER_FILTER_BLOCK = 32; + + // Default seed for hash function, it comes from System.nanoTime(). + private static final int DEFAULT_SEED = 1361930890; + + // Minimum Bloom filter size, set to size of a tiny Bloom filter block + public static final int MINIMUM_BLOOM_FILTER_BYTES = 32; + + // Maximum Bloom filter size, it sets to default HDFS block size for upper boundary check + // This should be re-consider when implementing write side logic. 
+ public static final int MAXIMUM_BLOOM_FILTER_BYTES = 128 * 1024 * 1024; + + // The number of bits to set in a tiny Bloom filter + private static final int BITS_SET_PER_BLOCK = 8; + + // The header of Bloom filter, it includes number of bytes, algorithm and hash enumeration. + public static final int HEADER_SIZE = 12; + + // The default false positive probability value + public static final double DEFAULT_FPP = 0.01; + + // Hash strategy used in this Bloom filter. + public final HashStrategy hashStrategy; + + // Algorithm used in this Bloom filter. + public final Algorithm algorithm; + + // The underlying byte array for Bloom filter bitset. + private byte[] bitset; + + // A integer array buffer of underlying bitset to help setting bits. + private IntBuffer intBuffer; + + // Hash function use to compute hash for column value. + private HashFunction hashFunction; + + // The block-based algorithm needs 8 odd SALT values to calculate eight index + // of bit to set, one bit in 32-bit word. + private static final int SALT[] = {0x47b6137b, 0x44974d91, 0x8824ad5b, 0xa2b7289d, + 0x705495c7, 0x2df1424b, 0x9efc4947, 0x5c6bfb31}; + + /** + * Constructor of Bloom filter. + * + * @param numBytes The number of bytes for Bloom filter bitset. The range of num_bytes should be within + * [MINIMUM_BLOOM_FILTER_BYTES, MAXIMUM_BLOOM_FILTER_BYTES], it will be rounded up/down + * to lower/upper bound if num_bytes is out of range and also will rounded up to a power + * of 2. It uses murmur3_x64_128 as its default hash function and block-based algorithm + * as default algorithm. + */ + public BlockSplitBloomFilter(int numBytes) { + this(numBytes, HashStrategy.MURMUR3_X64_128, Algorithm.BLOCK); + } + + /** + * Constructor of Bloom filter. It uses murmur3_x64_128 as its default hash + * function and block-based algorithm as its default algorithm. + * + * @param numBytes The number of bytes for Bloom filter bitset + * @param hashStrategy The hash strategy of Bloom filter. 
+ * @param algorithm The algorithm of Bloom filter. + */ + private BlockSplitBloomFilter(int numBytes, HashStrategy hashStrategy, Algorithm algorithm) { + initBitset(numBytes); + + switch (hashStrategy) { + case MURMUR3_X64_128: + this.hashStrategy = hashStrategy; + hashFunction = Hashing.murmur3_128(DEFAULT_SEED); + break; + default: + throw new RuntimeException("Not supported hash strategy"); + } + + this.algorithm = algorithm; + } + + + /** + * Construct the Bloom filter with given bitset, it is used when reconstructing + * Bloom filter from parquet file. It use murmur3_x64_128 as its default hash + * function and block-based algorithm as default algorithm. + * + * @param bitset The given bitset to construct Bloom filter. + */ + public BlockSplitBloomFilter(byte[] bitset) { + this(bitset, HashStrategy.MURMUR3_X64_128, Algorithm.BLOCK); + } + + /** + * Construct the Bloom filter with given bitset, it is used when reconstructing + * Bloom filter from parquet file. + * + * @param bitset The given bitset to construct Bloom filter. + * @param hashStrategy The hash strategy Bloom filter apply. + * @param algorithm The algorithm of Bloom filter. + */ + private BlockSplitBloomFilter(byte[] bitset, HashStrategy hashStrategy, Algorithm algorithm) { + if (bitset == null) { + throw new RuntimeException("Given bitset is null"); + } + this.bitset = bitset; + this.intBuffer = ByteBuffer.wrap(bitset).order(ByteOrder.LITTLE_ENDIAN).asIntBuffer(); + + switch (hashStrategy) { + case MURMUR3_X64_128: + this.hashStrategy = hashStrategy; + hashFunction = Hashing.murmur3_128(DEFAULT_SEED); + break; + default: + throw new RuntimeException("Not supported hash strategy"); + } + this.algorithm = algorithm; + } + + /** + * Create a new bitset for Bloom filter. + * + * @param numBytes The number of bytes for Bloom filter bitset. 
The range of num_bytes should be within + * [MINIMUM_BLOOM_FILTER_BYTES, MAXIMUM_BLOOM_FILTER_BYTES], it will be rounded up/down + * to lower/upper bound if num_bytes is out of range and also will rounded up to a power + * of 2. It uses murmur3_x64_128 as its default hash function and block-based algorithm + * as default algorithm. + */ + private void initBitset(int numBytes) { + if (numBytes < MINIMUM_BLOOM_FILTER_BYTES) { + numBytes = MINIMUM_BLOOM_FILTER_BYTES; + } + + // Get next power of 2 if it is not power of 2. + if ((numBytes & (numBytes - 1)) != 0) { + numBytes = Integer.highestOneBit(numBytes) << 1; + } + + if (numBytes > MAXIMUM_BLOOM_FILTER_BYTES || numBytes < 0) { + numBytes = MAXIMUM_BLOOM_FILTER_BYTES; + } + + this.bitset = new byte[numBytes]; + this.intBuffer = ByteBuffer.wrap(bitset).order(ByteOrder.LITTLE_ENDIAN).asIntBuffer(); + } + + @Override + public void writeTo(OutputStream out) throws IOException { + // Write number of bytes of bitset. + out.write(BytesUtils.intToBytes(bitset.length)); + + // Write hash strategy + out.write(BytesUtils.intToBytes(this.hashStrategy.ordinal())); + + // Write algorithm + out.write(BytesUtils.intToBytes(this.algorithm.ordinal())); + + // Write bitset + out.write(bitset); + } + + private int[] setMask(int key) { + int mask[] = new int[BITS_SET_PER_BLOCK]; + + for (int i = 0; i < BITS_SET_PER_BLOCK; ++i) { + mask[i] = key * SALT[i]; + } + + for (int i = 0; i < BITS_SET_PER_BLOCK; ++i) { + mask[i] = mask[i] >>> 27; + } + + for (int i = 0; i < BITS_SET_PER_BLOCK; ++i) { + mask[i] = 0x1 << mask[i]; + } + + return mask; + } + + @Override + public void insert(long hash) { + int bucketIndex = (int)(hash >> 32) & (bitset.length / BYTES_PER_FILTER_BLOCK - 1); + int key = (int)hash; + + // Calculate mask for bucket. 
+ int mask[] = setMask(key); + + for (int i = 0; i < BITS_SET_PER_BLOCK; i++) { + int value = intBuffer.get(bucketIndex * (BYTES_PER_FILTER_BLOCK / 4) + i); + value |= mask[i]; + intBuffer.put(bucketIndex * (BYTES_PER_FILTER_BLOCK / 4) + i, value); + } + } + + @Override + public boolean find(long hash) { + int bucketIndex = (int)(hash >> 32) & (bitset.length / BYTES_PER_FILTER_BLOCK - 1); + int key = (int)hash; + + // Calculate mask for the tiny Bloom filter. + int mask[] = setMask(key); + + for (int i = 0; i < BITS_SET_PER_BLOCK; i++) { + if (0 == (intBuffer.get(bucketIndex * (BYTES_PER_FILTER_BLOCK / 4) + i) & mask[i])) { + return false; + } + } + + return true; + } + + /** + * Calculate optimal size according to the number of distinct values and false positive probability. + * + * @param n: The number of distinct values. + * @param p: The false positive probability. + * @return optimal number of bits of given n and p. + */ + public static int optimalNumOfBits(long n, double p) { + Preconditions.checkArgument((p > 0.0 && p < 1.0), + "FPP should be less than 1.0 and great than 0.0"); + + final double m = -8 * n / Math.log(1 - Math.pow(p, 1.0 / 8)); + final double MAX = MAXIMUM_BLOOM_FILTER_BYTES << 3; + int numBits = (int)m; + + // Handle overflow. + if (m > MAX || m < 0) { + numBits = (int)MAX; + } + + // Get next power of 2 if bits is not power of 2. 
+ if ((numBits & (numBits - 1)) != 0) { + numBits = Integer.highestOneBit(numBits) << 1; + } + + if (numBits < (MINIMUM_BLOOM_FILTER_BYTES << 3)) { + numBits = MINIMUM_BLOOM_FILTER_BYTES << 3; + } + + return numBits; + } + + @Override + public long hash(int value) { + ByteBuffer plain = ByteBuffer.allocate(Integer.SIZE/Byte.SIZE); + plain.order(ByteOrder.LITTLE_ENDIAN).putInt(value); + return hashFunction.hashBytes(plain.array()).asLong(); + } + + @Override + public long hash(long value) { + ByteBuffer plain = ByteBuffer.allocate(Long.SIZE/Byte.SIZE); + plain.order(ByteOrder.LITTLE_ENDIAN).putLong(value); + return hashFunction.hashBytes(plain.array()).asLong(); + } + + @Override + public long hash(double value) { + ByteBuffer plain = ByteBuffer.allocate(Double.SIZE/Byte.SIZE); + plain.order(ByteOrder.LITTLE_ENDIAN).putDouble(value); + return hashFunction.hashBytes(plain.array()).asLong(); + } + + @Override + public long hash(float value) { + ByteBuffer plain = ByteBuffer.allocate(Float.SIZE/Byte.SIZE); + plain.order(ByteOrder.LITTLE_ENDIAN).putFloat(value); + return hashFunction.hashBytes(plain.array()).asLong(); + } + + @Override + public long hash(Binary value) { + return hashFunction.hashBytes(value.getBytes()).asLong(); + } + + @Override + public long getBitsetSize() { + return this.bitset.length; + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java index 4548617b1b..430fab8d61 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java @@ -18,32 +18,18 @@ */ package org.apache.parquet.column.values.bloomfilter; -import com.google.common.hash.HashFunction; -import com.google.common.hash.Hashing; -import org.apache.parquet.Preconditions; -import 
org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.io.api.Binary; import java.io.IOException; import java.io.OutputStream; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.IntBuffer; /** * A Bloom filter is a compact structure to indicate whether an item is not in a set or probably - * in a set. BloomFilter class stores a bit set represents a elements set, a hash strategy and a - * Bloom filter algorithm. - * - * This Bloom filter is implemented using block-based Bloom filter algorithm from Putze et al.'s - * "Cache-, Hash- and Space-Efficient Bloom filters". The basic idea is to hash the item to a tiny - * Bloom filter which size fit a single cache line or smaller. This implementation sets 8 bits in - * each tiny Bloom filter. Each tiny Bloom filter is 32 bytes to take advantage of 32-byte SIMD - * instruction. + * in a set. The Bloom filter usually consists of a bit set that represents a elements set, + * a hash strategy and a Bloom filter algorithm. */ - -public class BloomFilter { - // Bloom filter Hash strategy . +public abstract class BloomFilter { + // Bloom filter Hash strategy. public enum HashStrategy { MURMUR3_X64_128, } @@ -53,203 +39,21 @@ public enum Algorithm { BLOCK, } - // Bytes in a tiny Bloom filter block. - private static final int BYTES_PER_FILTER_BLOCK = 32; - - // Default seed for hash function, it comes from System.nanoTime(). - private static final int DEFAULT_SEED = 1361930890; - - // Minimum Bloom filter size, set to size of a tiny Bloom filter block - public static final int MINIMUM_BLOOM_FILTER_BYTES = 32; - - // Maximum Bloom filter size, it sets to default HDFS block size for upper boundary check - // This should be re-consider when implementing write side logic. 
- public static final int MAXIMUM_BLOOM_FILTER_BYTES = 128 * 1024 * 1024; - - // The number of bits to set in a tiny Bloom filter - private static final int BITS_SET_PER_BLOCK = 8; - - // The header of Bloom filter, it includes number of bytes, algorithm and hash enumeration. - public static final int HEADER_SIZE = 12; - - // Hash strategy used in this Bloom filter. - public final HashStrategy hashStrategy; - - // Algorithm used in this Bloom filter. - public final Algorithm algorithm; - - // The underlying byte array for Bloom filter bitset. - private byte[] bitset; - - // A integer array buffer of underlying bitset to help setting bits. - private IntBuffer intBuffer; - - // Hash function use to compute hash for column value. - private HashFunction hashFunction; - - // The block-based algorithm needs 8 odd SALT values to calculate eight index - // of bit to set, one bit in 32-bit word. - private static final int SALT[] = {0x47b6137b, 0x44974d91, 0x8824ad5b, 0xa2b7289d, - 0x705495c7, 0x2df1424b, 0x9efc4947, 0x5c6bfb31}; - - /** - * Constructor of Bloom filter. - * - * @param numBytes The number of bytes for Bloom filter bitset. The range of num_bytes should be within - * [MINIMUM_BLOOM_FILTER_BYTES, MAXIMUM_BLOOM_FILTER_BYTES], it will be rounded up/down - * to lower/upper bound if num_bytes is out of range and also will rounded up to a power - * of 2. It uses murmur3_x64_128 as its default hash function and block-based algorithm - * as default algorithm. - */ - public BloomFilter(int numBytes) { - this(numBytes, HashStrategy.MURMUR3_X64_128, Algorithm.BLOCK); - } - - /** - * Constructor of Bloom filter. It uses murmur3_x64_128 as its default hash - * function and block-based algorithm as its default algorithm. - * - * @param numBytes The number of bytes for Bloom filter bitset - * @param hashStrategy The hash strategy of Bloom filter. - * @param algorithm The algorithm of Bloom filter. 
- */ - private BloomFilter(int numBytes, HashStrategy hashStrategy, Algorithm algorithm) { - initBitset(numBytes); - - switch (hashStrategy) { - case MURMUR3_X64_128: - this.hashStrategy = hashStrategy; - hashFunction = Hashing.murmur3_128(DEFAULT_SEED); - break; - default: - throw new RuntimeException("Not supported hash strategy"); - } - - this.algorithm = algorithm; - } - - - /** - * Construct the Bloom filter with given bitset, it is used when reconstructing - * Bloom filter from parquet file. It use murmur3_x64_128 as its default hash - * function and block-based algorithm as default algorithm. - * - * @param bitset The given bitset to construct Bloom filter. - */ - public BloomFilter(byte[] bitset) { - this(bitset, HashStrategy.MURMUR3_X64_128, Algorithm.BLOCK); - } - - /** - * Construct the Bloom filter with given bitset, it is used when reconstructing - * Bloom filter from parquet file. - * - * @param bitset The given bitset to construct Bloom filter. - * @param hashStrategy The hash strategy Bloom filter apply. - * @param algorithm The algorithm of Bloom filter. - */ - private BloomFilter(byte[] bitset, HashStrategy hashStrategy, Algorithm algorithm) { - if (bitset == null) { - throw new RuntimeException("Given bitset is null"); - } - this.bitset = bitset; - this.intBuffer = ByteBuffer.wrap(bitset).order(ByteOrder.LITTLE_ENDIAN).asIntBuffer(); - - switch (hashStrategy) { - case MURMUR3_X64_128: - this.hashStrategy = hashStrategy; - hashFunction = Hashing.murmur3_128(DEFAULT_SEED); - break; - default: - throw new RuntimeException("Not supported hash strategy"); - } - this.algorithm = algorithm; - } - - /** - * Create a new bitset for Bloom filter. - * - * @param numBytes The number of bytes for Bloom filter bitset. The range of num_bytes should be within - * [MINIMUM_BLOOM_FILTER_BYTES, MAXIMUM_BLOOM_FILTER_BYTES], it will be rounded up/down - * to lower/upper bound if num_bytes is out of range and also will rounded up to a power - * of 2. 
It uses murmur3_x64_128 as its default hash function and block-based algorithm - * as default algorithm. - */ - private void initBitset(int numBytes) { - if (numBytes < MINIMUM_BLOOM_FILTER_BYTES) { - numBytes = MINIMUM_BLOOM_FILTER_BYTES; - } - - // Get next power of 2 if it is not power of 2. - if ((numBytes & (numBytes - 1)) != 0) { - numBytes = Integer.highestOneBit(numBytes) << 1; - } - - if (numBytes > MAXIMUM_BLOOM_FILTER_BYTES || numBytes < 0) { - numBytes = MAXIMUM_BLOOM_FILTER_BYTES; - } - - this.bitset = new byte[numBytes]; - this.intBuffer = ByteBuffer.wrap(bitset).order(ByteOrder.LITTLE_ENDIAN).asIntBuffer(); - } - /** * Write the Bloom filter to an output stream. It writes the Bloom filter header includes the * bitset's length in size of byte, the hash strategy, the algorithm, and the bitset. * * @param out the output stream to write */ - public void writeTo(OutputStream out) throws IOException { - // Write number of bytes of bitset. - out.write(BytesUtils.intToBytes(bitset.length)); - - // Write hash strategy - out.write(BytesUtils.intToBytes(this.hashStrategy.ordinal())); - - // Write algorithm - out.write(BytesUtils.intToBytes(this.algorithm.ordinal())); - - // Write bitset - out.write(bitset); - } - - private int[] setMask(int key) { - int mask[] = new int[BITS_SET_PER_BLOCK]; - - for (int i = 0; i < BITS_SET_PER_BLOCK; ++i) { - mask[i] = key * SALT[i]; - } - - for (int i = 0; i < BITS_SET_PER_BLOCK; ++i) { - mask[i] = mask[i] >>> 27; - } - - for (int i = 0; i < BITS_SET_PER_BLOCK; ++i) { - mask[i] = 0x1 << mask[i]; - } - - return mask; - } + public abstract void writeTo(OutputStream out) throws IOException; /** - * Add an element to Bloom filter, the element content is represented by + * Insert an element to the Bloom filter, the element content is represented by * the hash value of its plain encoding result. * * @param hash the hash result of element. 
*/ - public void insert(long hash) { - int bucketIndex = (int)(hash >> 32) & (bitset.length / BYTES_PER_FILTER_BLOCK - 1); - int key = (int)hash; - - // Calculate mask for bucket. - int mask[] = setMask(key); - - for (int i = 0; i < BITS_SET_PER_BLOCK; i++) { - int value = intBuffer.get(bucketIndex * (BYTES_PER_FILTER_BLOCK / 4) + i); - value |= mask[i]; - intBuffer.put(bucketIndex * (BYTES_PER_FILTER_BLOCK / 4) + i, value); - } - } + public abstract void insert(long hash); /** * Determine whether an element is in set or not. @@ -257,53 +61,7 @@ public void insert(long hash) { * @param hash the hash value of element plain encoding result. * @return false if element is must not in set, true if element probably in set. */ - public boolean find(long hash) { - int bucketIndex = (int)(hash >> 32) & (bitset.length / BYTES_PER_FILTER_BLOCK - 1); - int key = (int)hash; - - // Calculate mask for the tiny Bloom filter. - int mask[] = setMask(key); - - for (int i = 0; i < BITS_SET_PER_BLOCK; i++) { - if (0 == (intBuffer.get(bucketIndex * (BYTES_PER_FILTER_BLOCK / 4) + i) & mask[i])) { - return false; - } - } - - return true; - } - - /** - * Calculate optimal size according to the number of distinct values and false positive probability. - * - * @param n: The number of distinct values. - * @param p: The false positive probability. - * @return optimal number of bits of given n and p. - */ - public static int optimalNumOfBits(long n, double p) { - Preconditions.checkArgument((p > 0.0 && p < 1.0), - "FPP should be less than 1.0 and great than 0.0"); - - final double m = -8 * n / Math.log(1 - Math.pow(p, 1.0 / 8)); - final double MAX = MAXIMUM_BLOOM_FILTER_BYTES << 3; - int numBits = (int)m; - - // Handle overflow. - if (m > MAX || m < 0) { - numBits = (int)MAX; - } - - // Get next power of 2 if bits is not power of 2. 
- if ((numBits & (numBits - 1)) != 0) { - numBits = Integer.highestOneBit(numBits) << 1; - } - - if (numBits < (MINIMUM_BLOOM_FILTER_BYTES << 3)) { - numBits = MINIMUM_BLOOM_FILTER_BYTES << 3; - } - - return numBits; - } + public abstract boolean find(long hash); /** * Compute hash for int value by using its plain encoding result. @@ -311,11 +69,7 @@ public static int optimalNumOfBits(long n, double p) { * @param value the value to hash * @return hash result */ - public long hash(int value) { - ByteBuffer plain = ByteBuffer.allocate(Integer.SIZE/Byte.SIZE); - plain.order(ByteOrder.LITTLE_ENDIAN).putInt(value); - return hashFunction.hashBytes(plain.array()).asLong(); - } + public abstract long hash(int value); /** * Compute hash for long value by using its plain encoding result. @@ -323,11 +77,7 @@ public long hash(int value) { * @param value the value to hash * @return hash result */ - public long hash(long value) { - ByteBuffer plain = ByteBuffer.allocate(Long.SIZE/Byte.SIZE); - plain.order(ByteOrder.LITTLE_ENDIAN).putLong(value); - return hashFunction.hashBytes(plain.array()).asLong(); - } + public abstract long hash(long value) ; /** * Compute hash for double value by using its plain encoding result. @@ -335,11 +85,7 @@ public long hash(long value) { * @param value the value to hash * @return hash result */ - public long hash(double value) { - ByteBuffer plain = ByteBuffer.allocate(Double.SIZE/Byte.SIZE); - plain.order(ByteOrder.LITTLE_ENDIAN).putDouble(value); - return hashFunction.hashBytes(plain.array()).asLong(); - } + public abstract long hash(double value); /** * Compute hash for float value by using its plain encoding result. 
@@ -347,27 +93,19 @@ public long hash(double value) { * @param value the value to hash * @return hash result */ - public long hash(float value) { - ByteBuffer plain = ByteBuffer.allocate(Float.SIZE/Byte.SIZE); - plain.order(ByteOrder.LITTLE_ENDIAN).putFloat(value); - return hashFunction.hashBytes(plain.array()).asLong(); - } - + public abstract long hash(float value); /** * Compute hash for Binary value by using its plain encoding result. * * @param value the value to hash * @return hash result */ - public long hash(Binary value) { - return hashFunction.hashBytes(value.toByteBuffer()).asLong(); - } + public abstract long hash(Binary value); /** - * Get allocated buffer size. - * @return size in byte. + * Get the number of bytes for bitset in this Bloom filter. + * + * @return The number of bytes for bitset in this Bloom filter. */ - public long getBufferedSize() { - return this.bitset.length; - } + public abstract long getBitsetSize(); } diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java b/parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java new file mode 100644 index 0000000000..542b9cd25a --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.column.values.bloomfilter; + + + import java.io.File; + import java.io.FileInputStream; + import java.io.FileOutputStream; + import java.io.IOException; + import java.nio.ByteBuffer; + import java.nio.ByteOrder; + import java.util.ArrayList; + import java.util.List; + import java.util.Random; + + import jdk.nashorn.internal.ir.Block; + import org.apache.parquet.column.values.RandomStr; + import org.apache.parquet.io.api.Binary; + import org.junit.Rule; + import org.junit.Test; + import org.junit.rules.TemporaryFolder; + + import static org.junit.Assert.assertEquals; + import static org.junit.Assert.assertTrue; + +public class TestBlockSplitBloomFilter { + + @Test + public void testConstructor () throws IOException { + BloomFilter bloomFilter1 = new BlockSplitBloomFilter(0); + assertEquals(bloomFilter1.getBitsetSize(), BlockSplitBloomFilter.MINIMUM_BLOOM_FILTER_BYTES); + + BloomFilter bloomFilter2 = new BlockSplitBloomFilter(256 * 1024 * 1024); + assertEquals(bloomFilter2.getBitsetSize(), BlockSplitBloomFilter.MAXIMUM_BLOOM_FILTER_BYTES); + + BloomFilter bloomFilter3 = new BlockSplitBloomFilter(1000); + assertEquals(bloomFilter3.getBitsetSize(), 1024); + } + + @Rule + public final TemporaryFolder temp = new TemporaryFolder(); + /* + * This test is used to test basic operations including inserting, finding and + * serializing and de-serializing. 
+ */ + @Test + public void testBasic () throws IOException { + final String testStrings[] = {"hello", "parquet", "bloom", "filter"}; + BloomFilter bloomFilter = new BlockSplitBloomFilter(1024); + + for(int i = 0; i < testStrings.length; i++) { + bloomFilter.insert(bloomFilter.hash(Binary.fromString(testStrings[i]))); + } + + File testFile = temp.newFile(); + FileOutputStream fileOutputStream = new FileOutputStream(testFile); + bloomFilter.writeTo(fileOutputStream); + fileOutputStream.close(); + + FileInputStream fileInputStream = new FileInputStream(testFile); + + byte[] value = new byte[4]; + + fileInputStream.read(value); + int length = ByteBuffer.wrap(value).order(ByteOrder.LITTLE_ENDIAN).getInt(); + assertEquals(length, 1024); + + fileInputStream.read(value); + int hash = ByteBuffer.wrap(value).order(ByteOrder.LITTLE_ENDIAN).getInt(); + assertEquals(hash, BloomFilter.HashStrategy.MURMUR3_X64_128.ordinal()); + + fileInputStream.read(value); + int algorithm = ByteBuffer.wrap(value).order(ByteOrder.LITTLE_ENDIAN).getInt(); + assertEquals(algorithm, BloomFilter.Algorithm.BLOCK.ordinal()); + + byte[] bitset = new byte[length]; + fileInputStream.read(bitset); + bloomFilter = new BlockSplitBloomFilter(bitset); + + for(int i = 0; i < testStrings.length; i++) { + assertTrue(bloomFilter.find(bloomFilter.hash(Binary.fromString(testStrings[i])))); + } + } + + @Test + public void testFPP() throws IOException { + final int totalCount = 100000; + final double FPP = 0.01; + final long SEED = 104729; + + BloomFilter bloomFilter = new BlockSplitBloomFilter(BlockSplitBloomFilter.optimalNumOfBits(totalCount, FPP)); + List strings = new ArrayList<>(); + RandomStr randomStr = new RandomStr(new Random(SEED)); + for(int i = 0; i < totalCount; i++) { + String str = randomStr.get(10); + strings.add(str); + bloomFilter.insert(bloomFilter.hash(Binary.fromString(str))); + } + + // The exist counts the number of times FindHash returns true. 
+ int exist = 0; + for (int i = 0; i < totalCount; i++) { + String str = randomStr.get(8); + if (bloomFilter.find(bloomFilter.hash(Binary.fromString(str)))) { + exist ++; + } + } + + // The exist should be probably less than 1000 according FPP 0.01. + assertTrue(exist < totalCount * FPP); + } +} diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java index 58b1450dfb..caa41fc7c0 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java @@ -36,6 +36,7 @@ import org.apache.parquet.column.page.PageWriteStore; import org.apache.parquet.column.page.PageWriter; import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter; import org.apache.parquet.column.values.bloomfilter.BloomFilter; import org.apache.parquet.column.values.bloomfilter.BloomFilterWriteStore; import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter; diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java index a7d07ba007..3975bf9f48 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java @@ -60,6 +60,7 @@ import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.Encoding; import org.apache.parquet.column.page.DictionaryPageReadStore; +import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter; import org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor; import org.apache.parquet.filter2.compat.FilterCompat; import org.apache.parquet.filter2.compat.RowGroupFilter; @@ 
-930,7 +931,7 @@ public BloomFilter readBloomFilter(ColumnChunkMetaData meta) throws IOException f.seek(bloomFilterOffset); // Read Bloom filter data header. - byte[] bytes = new byte[BloomFilter.HEADER_SIZE]; + byte[] bytes = new byte[BlockSplitBloomFilter.HEADER_SIZE]; f.read(bytes); ByteBuffer bloomHeader = ByteBuffer.wrap(bytes); IntBuffer headerBuffer = bloomHeader.order(ByteOrder.LITTLE_ENDIAN).asIntBuffer(); @@ -942,7 +943,7 @@ public BloomFilter readBloomFilter(ColumnChunkMetaData meta) throws IOException byte[] bitset = new byte[numBytes]; f.readFully(bitset); - return new BloomFilter(bitset); + return new BlockSplitBloomFilter(bitset); } @Override diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java index 6e191b005c..ffcf5c6a32 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -144,7 +144,7 @@ public static enum JobSummaryLevel { public static final String MAX_ROW_COUNT_FOR_PAGE_SIZE_CHECK = "parquet.page.size.row.check.max"; public static final String ESTIMATE_PAGE_SIZE_CHECK = "parquet.page.size.check.estimate"; public static final String BLOOM_FILTER_COLUMN_NAMES = "parquet.bloom.filter.column.names"; - public static final String BLOOM_FILTER_SIZES = "parquet.bloom.filter.size"; + public static final String BLOOM_FILTER_EXPECT_DISTINCT_NUMBERS = "parquet.bloom.filter.expected.distinct.numbers"; public static final String ENABLE_BLOOM_FILTER = "parquet.enable.bloom.filter"; public static JobSummaryLevel getJobSummaryLevel(Configuration conf) { @@ -261,8 +261,8 @@ public static boolean getEnableBloomFilter(Configuration configuration) { ParquetProperties.DEFAULT_BLOOM_FILTER_ENABLED); } - public static String getBloomFilterSizes(Configuration configuration) { - return configuration.get(BLOOM_FILTER_SIZES); + public static String getBloomFilterExpectedDistinctNumbers(Configuration configuration) { + return configuration.get(BLOOM_FILTER_EXPECT_DISTINCT_NUMBERS); } public static int getMinRowCountForPageSizeCheck(Configuration configuration) { @@ -386,7 +386,7 @@ public RecordWriter getRecordWriter(Configuration conf, Path file, Comp .withPageSize(getPageSize(conf)) .withDictionaryPageSize(getDictionaryPageSize(conf)) .withBloomFilterEnabled(getEnableBloomFilter(conf)) - .withBloomFilterInfo(getBloomFilterColumnNames(conf), getBloomFilterSizes(conf)) + .withBloomFilterInfo(getBloomFilterColumnNames(conf), getBloomFilterExpectedDistinctNumbers(conf)) .withDictionaryEncoding(getEnableDictionary(conf)) .withWriterVersion(getWriterVersion(conf)) 
.estimateRowCountForPageSizeCheck(getEstimatePageSizeCheck(conf)) @@ -410,8 +410,8 @@ public RecordWriter getRecordWriter(Configuration conf, Path file, Comp LOG.info("Min row count for page size check is: {}", props.getMinRowCountForPageSizeCheck()); LOG.info("Max row count for page size check is: {}", props.getMaxRowCountForPageSizeCheck()); LOG.info("Parquet Bloom Filter is {}", props.isBloomFilterEnabled()? "on": "off"); - LOG.info("Parquet Bloom filter column names are: {}", props.getBloomFilterInfo().keySet()); - LOG.info("Parquet Bloom filter column sizes are: {}", props.getBloomFilterInfo().values()); + LOG.info("Parquet Bloom filter column names are: {}", props.getBloomFilterExpectValues().keySet()); + LOG.info("Parquet Bloom filter column expect distinct values are: {}", props.getBloomFilterExpectValues().values()); } WriteContext init = writeSupport.init(conf); diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java index 636515dc8f..535394b370 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -153,7 +153,7 @@ public void testBloomWriteRead() throws Exception { w.startColumn(col, 5, CODEC); w.writeDataPage(2, 4, BytesInput.from(BYTES1),stats1, BIT_PACKED, BIT_PACKED, PLAIN); w.writeDataPage(3, 4, BytesInput.from(BYTES1),stats1, BIT_PACKED, BIT_PACKED, PLAIN); - BloomFilter bloomData = new BloomFilter(0); + BloomFilter bloomData = new BlockSplitBloomFilter(0); bloomData.insert(bloomData.hash(Binary.fromString("hello"))); bloomData.insert(bloomData.hash(Binary.fromString("world"))); long blStarts = w.getPos(); diff --git a/pom.xml b/pom.xml index ee8ae94fb7..6e6902847b 100644 --- a/pom.xml +++ b/pom.xml @@ -81,7 +81,7 @@ 1.2.1 2.7.1 3.1.2 - 2.4.0 + 2.7.0-SNAPSHOT 1.7.0 thrift 2.10.6 @@ -96,7 +96,7 @@ 0.9.33 1.7.22 1.8.2 - 24.0-jre + 20.0 0.1.1 1.10.19 From e3991eeefc9f6f542a0f5ac5e531b81152812ae1 Mon Sep 17 00:00:00 2001 From: Junjie Chen Date: Sun, 21 Oct 2018 21:29:35 +0800 Subject: [PATCH 3/9] Rebase to latest master --- .travis.yml | 2 +- parquet-arrow/pom.xml | 2 +- .../parquet/arrow/schema/SchemaConverter.java | 269 +-- .../arrow/schema/TestSchemaConverter.java | 61 +- parquet-avro/pom.xml | 4 +- .../parquet/avro/AvroSchemaConverter.java | 154 +- .../parquet/avro/TestAvroSchemaConverter.java | 14 +- .../cascading/convert/TupleConverter.java | 9 +- .../java/org/apache/parquet/cli/Main.java | 2 + .../java/org/apache/parquet/cli/Util.java | 10 + .../cli/commands/ParquetMetadataCommand.java | 4 +- .../cli/commands/ShowColumnIndexCommand.java | 157 ++ .../cli/commands/ShowDictionaryCommand.java | 4 +- .../cli/commands/ShowPagesCommand.java | 4 +- .../apache/parquet/column/ColumnReader.java | 3 + .../parquet/column/ParquetProperties.java | 125 +- 
.../column/impl/ColumnReadStoreImpl.java | 14 +- .../parquet/column/impl/ColumnReaderBase.java | 760 ++++++++ .../parquet/column/impl/ColumnReaderImpl.java | 676 +------ .../column/impl/ColumnWriteStoreBase.java | 255 +++ .../column/impl/ColumnWriteStoreV1.java | 129 +- .../column/impl/ColumnWriteStoreV2.java | 176 +- .../parquet/column/impl/ColumnWriterBase.java | 400 +++++ .../parquet/column/impl/ColumnWriterV1.java | 323 +--- .../parquet/column/impl/ColumnWriterV2.java | 348 +--- .../impl/SynchronizingColumnReader.java | 111 ++ .../apache/parquet/column/page/DataPage.java | 22 + .../parquet/column/page/DataPageV1.java | 31 + .../parquet/column/page/DataPageV2.java | 52 + .../parquet/column/page/PageReadStore.java | 15 +- .../parquet/column/page/PageWriter.java | 17 +- .../parquet/column/values/ValuesReader.java | 12 + .../bloomfilter/BlockSplitBloomFilter.java | 22 +- .../values/bloomfilter/BloomFilter.java | 2 +- .../bloomfilter/BloomFilterReadStore.java | 3 - .../values/bloomfilter/BloomFilterReader.java | 1 + .../bloomfilter/BloomFilterWriteStore.java | 1 + .../values/bloomfilter/BloomFilterWriter.java | 8 +- .../delta/DeltaBinaryPackingValuesReader.java | 8 + .../DeltaLengthByteArrayValuesReader.java | 12 +- .../FixedLenByteArrayPlainValuesReader.java | 8 +- .../values/plain/PlainValuesReader.java | 36 +- ...RunLengthBitPackingHybridValuesWriter.java | 9 +- .../values/rle/ZeroIntegerValuesReader.java | 4 + .../filter2/predicate/ValidTypeMap.java | 7 +- .../columnindex/BinaryColumnIndexBuilder.java | 140 ++ .../column/columnindex/BinaryTruncator.java | 208 +++ .../BooleanColumnIndexBuilder.java | 133 ++ .../column/columnindex/BoundaryOrder.java | 352 ++++ .../column/columnindex/ColumnIndex.java | 60 + .../columnindex/ColumnIndexBuilder.java | 636 +++++++ .../columnindex/DoubleColumnIndexBuilder.java | 155 ++ .../columnindex/FloatColumnIndexBuilder.java | 155 ++ .../column/columnindex/IndexIterator.java | 98 ++ .../columnindex/IntColumnIndexBuilder.java | 136 
++ .../columnindex/LongColumnIndexBuilder.java | 136 ++ .../column/columnindex/OffsetIndex.java | 64 + .../columnindex/OffsetIndexBuilder.java | 175 ++ .../columnindex/ColumnIndexFilter.java | 194 +++ .../filter2/columnindex/ColumnIndexStore.java | 55 + .../filter2/columnindex/RowRanges.java | 288 +++ .../parquet/schema/ConversionPatterns.java | 28 +- .../org/apache/parquet/schema/GroupType.java | 36 +- .../parquet/schema/LogicalTypeAnnotation.java | 140 +- .../apache/parquet/schema/MessageType.java | 8 +- .../apache/parquet/schema/OriginalType.java | 66 +- .../parquet/schema/PrimitiveComparator.java | 6 +- .../parquet/schema/PrimitiveStringifier.java | 130 +- .../apache/parquet/schema/PrimitiveType.java | 213 ++- .../java/org/apache/parquet/schema/Types.java | 137 +- .../column/impl/TestColumnReaderImpl.java | 8 +- .../parquet/column/mem/TestMemColumn.java | 12 +- .../column/page/mem/MemPageWriter.java | 6 + .../bitpacking/TestBitPackingColumn.java | 16 + .../TestBlockSplitBloomFilter.java | 46 +- ...naryPackingValuesWriterForIntegerTest.java | 17 + ...aBinaryPackingValuesWriterForLongTest.java | 17 + .../TestDeltaLengthByteArray.java | 24 + .../deltastrings/TestDeltaByteArray.java | 19 + .../values/dictionary/TestDictionary.java | 105 ++ .../filter2/predicate/TestValidTypeMap.java | 7 +- .../columnindex/TestBinaryTruncator.java | 285 +++ .../column/columnindex/TestBoundaryOrder.java | 487 ++++++ .../columnindex/TestColumnIndexBuilder.java | 1546 +++++++++++++++++ .../column/columnindex/TestIndexIterator.java | 63 + .../columnindex/TestOffsetIndexBuilder.java | 113 ++ .../columnindex/TestColumnIndexFilter.java | 464 +++++ .../filter2/columnindex/TestRowRanges.java | 155 ++ .../parquet/parser/TestParquetParser.java | 5 + .../parquet/schema/TestMessageType.java | 2 +- .../schema/TestPrimitiveComparator.java | 19 + .../schema/TestPrimitiveStringifier.java | 144 +- .../TestTypeBuildersWithLogicalTypes.java | 408 +++++ parquet-common/pom.xml | 10 +- 
parquet-format-structures/pom.xml | 206 +++ .../parquet/format/InterningProtocol.java | 231 +++ .../apache/parquet/format/LogicalTypes.java | 55 + .../java/org/apache/parquet/format/Util.java | 236 +++ .../parquet/format/event/Consumers.java | 193 ++ .../format/event/EventBasedThriftReader.java | 126 ++ .../parquet/format/event/FieldConsumer.java | 39 + .../parquet/format/event/TypedConsumer.java | 205 +++ .../org/apache/parquet/format/TestUtil.java | 83 + parquet-hadoop/pom.xml | 4 +- .../org/apache/parquet/HadoopReadOptions.java | 9 +- .../apache/parquet/ParquetReadOptions.java | 20 +- .../converter/ParquetMetadataConverter.java | 357 ++-- .../parquet/hadoop/BloomFilterDataReader.java | 9 - .../hadoop/ColumnChunkPageReadStore.java | 115 +- .../hadoop/ColumnChunkPageWriteStore.java | 84 +- .../hadoop/ColumnIndexFilterUtils.java | 157 ++ .../parquet/hadoop/ColumnIndexStoreImpl.java | 155 ++ .../hadoop/InternalParquetRecordReader.java | 6 +- .../hadoop/InternalParquetRecordWriter.java | 8 +- .../parquet/hadoop/ParquetFileReader.java | 432 ++++- .../parquet/hadoop/ParquetFileWriter.java | 335 +++- .../parquet/hadoop/ParquetInputFormat.java | 11 +- .../parquet/hadoop/ParquetOutputFormat.java | 52 +- .../apache/parquet/hadoop/ParquetReader.java | 10 + .../apache/parquet/hadoop/ParquetWriter.java | 2 +- .../hadoop/metadata/ColumnChunkMetaData.java | 107 +- .../parquet/hadoop/util/BlocksCombiner.java | 106 ++ .../hadoop/metadata/IndexReference.java | 41 + .../filter2/recordlevel/PhoneBookWriter.java | 105 +- .../TestParquetMetadataConverter.java | 74 + .../hadoop/TestColumnChunkPageWriteStore.java | 94 +- .../hadoop/TestColumnIndexFiltering.java | 442 +++++ .../parquet/hadoop/TestParquetFileWriter.java | 224 ++- .../hadoop/TestParquetWriterMergeBlocks.java | 280 +++ .../parquet/convert/HiveSchemaConverter.java | 17 +- parquet-pig/pom.xml | 4 +- .../parquet/pig/PigSchemaConverter.java | 124 +- .../parquet/pig/convert/TupleConverter.java | 31 +- parquet-protobuf/pom.xml | 
11 + .../parquet/proto/ProtoMessageConverter.java | 43 +- .../parquet/proto/ProtoSchemaConverter.java | 45 +- .../parquet/proto/ProtoWriteSupport.java | 29 +- parquet-thrift/pom.xml | 11 + .../thrift/ThriftSchemaConvertVisitor.java | 18 +- parquet-tools/pom.xml | 4 +- .../tools/command/ColumnIndexCommand.java | 182 ++ .../parquet/tools/command/DumpCommand.java | 1 - .../parquet/tools/command/MergeCommand.java | 75 +- .../parquet/tools/command/MetadataUtils.java | 212 +++ .../parquet/tools/command/Registry.java | 1 + .../tools/command/ShowMetaCommand.java | 29 +- .../tools/command/ShowSchemaCommand.java | 14 +- .../tools/read/SimpleRecordConverter.java | 66 +- .../parquet/tools/util/MetadataUtils.java | 9 +- pom.xml | 18 +- 150 files changed, 14764 insertions(+), 2806 deletions(-) create mode 100644 parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowColumnIndexCommand.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReaderBase.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreBase.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/column/impl/SynchronizingColumnReader.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryTruncator.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BooleanColumnIndexBuilder.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BoundaryOrder.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndex.java create mode 100644 
parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/IndexIterator.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/IntColumnIndexBuilder.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/LongColumnIndexBuilder.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/OffsetIndex.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/OffsetIndexBuilder.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexFilter.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexStore.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/RowRanges.java create mode 100644 parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBinaryTruncator.java create mode 100644 parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java create mode 100644 parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java create mode 100644 parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestIndexIterator.java create mode 100644 parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestOffsetIndexBuilder.java create mode 100644 
parquet-column/src/test/java/org/apache/parquet/internal/filter2/columnindex/TestColumnIndexFilter.java create mode 100644 parquet-column/src/test/java/org/apache/parquet/internal/filter2/columnindex/TestRowRanges.java create mode 100644 parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuildersWithLogicalTypes.java create mode 100644 parquet-format-structures/pom.xml create mode 100644 parquet-format-structures/src/main/java/org/apache/parquet/format/InterningProtocol.java create mode 100644 parquet-format-structures/src/main/java/org/apache/parquet/format/LogicalTypes.java create mode 100644 parquet-format-structures/src/main/java/org/apache/parquet/format/Util.java create mode 100644 parquet-format-structures/src/main/java/org/apache/parquet/format/event/Consumers.java create mode 100644 parquet-format-structures/src/main/java/org/apache/parquet/format/event/EventBasedThriftReader.java create mode 100644 parquet-format-structures/src/main/java/org/apache/parquet/format/event/FieldConsumer.java create mode 100644 parquet-format-structures/src/main/java/org/apache/parquet/format/event/TypedConsumer.java create mode 100644 parquet-format-structures/src/test/java/org/apache/parquet/format/TestUtil.java create mode 100644 parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnIndexFilterUtils.java create mode 100644 parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnIndexStoreImpl.java create mode 100644 parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/BlocksCombiner.java create mode 100644 parquet-hadoop/src/main/java/org/apache/parquet/internal/hadoop/metadata/IndexReference.java create mode 100644 parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnIndexFiltering.java create mode 100644 parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriterMergeBlocks.java create mode 100644 parquet-tools/src/main/java/org/apache/parquet/tools/command/ColumnIndexCommand.java create mode 100644 
parquet-tools/src/main/java/org/apache/parquet/tools/command/MetadataUtils.java diff --git a/.travis.yml b/.travis.yml index da6a6ac80e..7ab4846c77 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,4 +35,4 @@ env: - HADOOP_PROFILE=default TEST_CODECS=gzip,snappy install: mvn install --batch-mode -DskipTests=true -Dmaven.javadoc.skip=true -Dsource.skip=true | pv -fbi 60 > mvn_install.log || (cat mvn_install.log && false) -script: mvn test -P $HADOOP_PROFILE +script: mvn verify -P $HADOOP_PROFILE diff --git a/parquet-arrow/pom.xml b/parquet-arrow/pom.xml index 232167ecb3..e0f305acbb 100644 --- a/parquet-arrow/pom.xml +++ b/parquet-arrow/pom.xml @@ -33,7 +33,7 @@ https://parquet.apache.org - 0.8.0 + 0.10.0 diff --git a/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java b/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java index a7df48cee0..51057c589e 100644 --- a/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java +++ b/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java @@ -19,22 +19,17 @@ package org.apache.parquet.arrow.schema; import static java.util.Arrays.asList; -import static org.apache.parquet.schema.OriginalType.DATE; -import static org.apache.parquet.schema.OriginalType.DECIMAL; -import static org.apache.parquet.schema.OriginalType.INTERVAL; -import static org.apache.parquet.schema.OriginalType.INT_16; -import static org.apache.parquet.schema.OriginalType.INT_32; -import static org.apache.parquet.schema.OriginalType.INT_64; -import static org.apache.parquet.schema.OriginalType.INT_8; -import static org.apache.parquet.schema.OriginalType.TIMESTAMP_MILLIS; -import static org.apache.parquet.schema.OriginalType.TIMESTAMP_MICROS; -import static org.apache.parquet.schema.OriginalType.TIME_MILLIS; -import static org.apache.parquet.schema.OriginalType.TIME_MICROS; -import static org.apache.parquet.schema.OriginalType.UINT_16; -import static 
org.apache.parquet.schema.OriginalType.UINT_32; -import static org.apache.parquet.schema.OriginalType.UINT_64; -import static org.apache.parquet.schema.OriginalType.UINT_8; -import static org.apache.parquet.schema.OriginalType.UTF8; +import static java.util.Optional.empty; +import static java.util.Optional.of; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MICROS; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MILLIS; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.NANOS; +import static org.apache.parquet.schema.LogicalTypeAnnotation.dateType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.decimalType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.intType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.timeType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.timestampType; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; @@ -48,6 +43,7 @@ import java.util.ArrayList; import java.util.List; +import java.util.Optional; import org.apache.arrow.vector.types.DateUnit; import org.apache.arrow.vector.types.FloatingPointPrecision; @@ -75,10 +71,9 @@ import org.apache.parquet.arrow.schema.SchemaMapping.StructTypeMapping; import org.apache.parquet.arrow.schema.SchemaMapping.TypeMapping; import org.apache.parquet.arrow.schema.SchemaMapping.UnionTypeMapping; -import org.apache.parquet.schema.DecimalMetadata; import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType; import 
org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type; @@ -180,13 +175,11 @@ public TypeMapping visit(Int type) { boolean signed = type.getIsSigned(); switch (type.getBitWidth()) { case 8: - return primitive(INT32, signed ? INT_8 : UINT_8); case 16: - return primitive(INT32, signed ? INT_16 : UINT_16); case 32: - return primitive(INT32, signed ? INT_32 : UINT_32); + return primitive(INT32, intType(type.getBitWidth(), signed)); case 64: - return primitive(INT64, signed ? INT_64 : UINT_64); + return primitive(INT64, intType(64, signed)); default: throw new IllegalArgumentException("Illegal int type: " + field); } @@ -209,7 +202,7 @@ public TypeMapping visit(FloatingPoint type) { @Override public TypeMapping visit(Utf8 type) { - return primitive(BINARY, UTF8); + return primitive(BINARY, stringType()); } @Override @@ -243,7 +236,7 @@ public TypeMapping visit(Decimal type) { @Override public TypeMapping visit(Date type) { - return primitive(INT32, DATE); + return primitive(INT32, dateType()); } @Override @@ -251,9 +244,11 @@ public TypeMapping visit(Time type) { int bitWidth = type.getBitWidth(); TimeUnit timeUnit = type.getUnit(); if (bitWidth == 32 && timeUnit == TimeUnit.MILLISECOND) { - return primitive(INT32, TIME_MILLIS); + return primitive(INT32, timeType(false, MILLIS)); } else if (bitWidth == 64 && timeUnit == TimeUnit.MICROSECOND) { - return primitive(INT64, TIME_MICROS); + return primitive(INT64, timeType(false, MICROS)); + } else if (bitWidth == 64 && timeUnit == TimeUnit.NANOSECOND) { + return primitive(INT64, timeType(false, NANOS)); } throw new UnsupportedOperationException("Unsupported type " + type); } @@ -262,20 +257,32 @@ public TypeMapping visit(Time type) { public TypeMapping visit(Timestamp type) { TimeUnit timeUnit = type.getUnit(); if (timeUnit == TimeUnit.MILLISECOND) { - return primitive(INT64, TIMESTAMP_MILLIS); + return primitive(INT64, timestampType(isUtcNormalized(type), MILLIS)); } else if 
(timeUnit == TimeUnit.MICROSECOND) { - return primitive(INT64, TIMESTAMP_MICROS); + return primitive(INT64, timestampType(isUtcNormalized(type), MICROS)); + } else if (timeUnit == TimeUnit.NANOSECOND) { + return primitive(INT64, timestampType(isUtcNormalized(type), NANOS)); } throw new UnsupportedOperationException("Unsupported type " + type); } + private boolean isUtcNormalized(Timestamp timestamp) { + String timeZone = timestamp.getTimezone(); + return timeZone != null && !timeZone.isEmpty(); + } + /** * See https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#interval */ @Override public TypeMapping visit(Interval type) { // TODO(PARQUET-675): fix interval original types - return primitiveFLBA(12, INTERVAL); + return primitiveFLBA(12, LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance()); + } + + @Override + public TypeMapping visit(ArrowType.FixedSizeBinary fixedSizeBinary) { + return primitive(BINARY); } private TypeMapping mapping(PrimitiveType parquetType) { @@ -283,18 +290,18 @@ private TypeMapping mapping(PrimitiveType parquetType) { } private TypeMapping decimal(PrimitiveTypeName type, int precision, int scale) { - return mapping(Types.optional(type).as(DECIMAL).precision(precision).scale(scale).named(fieldName)); + return mapping(Types.optional(type).as(decimalType(scale, precision)).named(fieldName)); } private TypeMapping primitive(PrimitiveTypeName type) { return mapping(Types.optional(type).named(fieldName)); } - private TypeMapping primitive(PrimitiveTypeName type, OriginalType otype) { + private TypeMapping primitive(PrimitiveTypeName type, LogicalTypeAnnotation otype) { return mapping(Types.optional(type).as(otype).named(fieldName)); } - private TypeMapping primitiveFLBA(int length, OriginalType otype) { + private TypeMapping primitiveFLBA(int length, LogicalTypeAnnotation otype) { return mapping(Types.optional(FIXED_LEN_BYTE_ARRAY).length(length).as(otype).named(fieldName)); } }); @@ -358,21 +365,21 @@ private 
TypeMapping fromParquet(Type type, String name, Repetition repetition) { * @return the mapping */ private TypeMapping fromParquetGroup(GroupType type, String name) { - OriginalType ot = type.getOriginalType(); - if (ot == null) { + LogicalTypeAnnotation logicalType = type.getLogicalTypeAnnotation(); + if (logicalType == null) { List typeMappings = fromParquet(type.getFields()); Field arrowField = new Field(name, type.isRepetition(OPTIONAL), new Struct(), fields(typeMappings)); return new StructTypeMapping(arrowField, type, typeMappings); } else { - switch (ot) { - case LIST: + return logicalType.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public Optional visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) { List3Levels list3Levels = new List3Levels(type); TypeMapping child = fromParquet(list3Levels.getElement(), null, list3Levels.getElement().getRepetition()); Field arrowField = new Field(name, type.isRepetition(OPTIONAL), new ArrowType.List(), asList(child.getArrowField())); - return new ListTypeMapping(arrowField, list3Levels, child); - default: - throw new UnsupportedOperationException("Unsupported type " + type); - } + return of(new ListTypeMapping(arrowField, list3Levels, child)); + } + }).orElseThrow(() -> new UnsupportedOperationException("Unsupported type " + type)); } } @@ -401,92 +408,86 @@ public TypeMapping convertDOUBLE(PrimitiveTypeName primitiveTypeName) throws Run @Override public TypeMapping convertINT32(PrimitiveTypeName primitiveTypeName) throws RuntimeException { - OriginalType ot = type.getOriginalType(); - if (ot == null) { + LogicalTypeAnnotation logicalTypeAnnotation = type.getLogicalTypeAnnotation(); + if (logicalTypeAnnotation == null) { return integer(32, true); } - switch (ot) { - case INT_8: - return integer(8, true); - case INT_16: - return integer(16, true); - case INT_32: - return integer(32, true); - case UINT_8: - return integer(8, false); - case UINT_16: - return integer(16, 
false); - case UINT_32: - return integer(32, false); - case DECIMAL: - return decimal(type.getDecimalMetadata()); - case DATE: - return field(new ArrowType.Date(DateUnit.DAY)); - case TIME_MILLIS: - return field(new ArrowType.Time(TimeUnit.MILLISECOND, 32)); - default: - case INT_64: - case UINT_64: - case UTF8: - case ENUM: - case BSON: - case INTERVAL: - case JSON: - case LIST: - case MAP: - case MAP_KEY_VALUE: - case TIMESTAMP_MICROS: - case TIMESTAMP_MILLIS: - case TIME_MICROS: - throw new IllegalArgumentException("illegal type " + type); - } + return logicalTypeAnnotation.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { + return of(decimal(decimalLogicalType.getPrecision(), decimalLogicalType.getScale())); + } + + @Override + public Optional visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { + return of(field(new ArrowType.Date(DateUnit.DAY))); + } + + @Override + public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { + return timeLogicalType.getUnit() == MILLIS ? 
of(field(new ArrowType.Time(TimeUnit.MILLISECOND, 32))) : empty(); + } + + @Override + public Optional visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) { + if (intLogicalType.getBitWidth() == 64) { + return empty(); + } + return of(integer(intLogicalType.getBitWidth(), intLogicalType.isSigned())); + } + }).orElseThrow(() -> new IllegalArgumentException("illegal type " + type)); } @Override public TypeMapping convertINT64(PrimitiveTypeName primitiveTypeName) throws RuntimeException { - OriginalType ot = type.getOriginalType(); - if (ot == null) { + LogicalTypeAnnotation logicalTypeAnnotation = type.getLogicalTypeAnnotation(); + if (logicalTypeAnnotation == null) { return integer(64, true); } - switch (ot) { - case INT_8: - return integer(8, true); - case INT_16: - return integer(16, true); - case INT_32: - return integer(32, true); - case INT_64: - return integer(64, true); - case UINT_8: - return integer(8, false); - case UINT_16: - return integer(16, false); - case UINT_32: - return integer(32, false); - case UINT_64: - return integer(64, false); - case DECIMAL: - return decimal(type.getDecimalMetadata()); - case DATE: - return field(new ArrowType.Date(DateUnit.DAY)); - case TIMESTAMP_MICROS: - return field(new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC")); - case TIMESTAMP_MILLIS: - return field(new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")); - case TIME_MICROS: - return field(new ArrowType.Time(TimeUnit.MICROSECOND, 64)); - default: - case UTF8: - case ENUM: - case BSON: - case INTERVAL: - case JSON: - case LIST: - case MAP: - case MAP_KEY_VALUE: - case TIME_MILLIS: - throw new IllegalArgumentException("illegal type " + type); - } + + return logicalTypeAnnotation.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public Optional visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { + return of(field(new ArrowType.Date(DateUnit.DAY))); + } + + @Override + public Optional 
visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { + return of(decimal(decimalLogicalType.getPrecision(), decimalLogicalType.getScale())); + } + + @Override + public Optional visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) { + return of(integer(intLogicalType.getBitWidth(), intLogicalType.isSigned())); + } + + @Override + public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { + if (timeLogicalType.getUnit() == MICROS) { + return of(field(new ArrowType.Time(TimeUnit.MICROSECOND, 64))); + } else if (timeLogicalType.getUnit() == NANOS) { + return of(field(new ArrowType.Time(TimeUnit.NANOSECOND, 64))); + } + return empty(); + } + + @Override + public Optional visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { + switch (timestampLogicalType.getUnit()) { + case MICROS: + return of(field(new ArrowType.Timestamp(TimeUnit.MICROSECOND, getTimeZone(timestampLogicalType)))); + case MILLIS: + return of(field(new ArrowType.Timestamp(TimeUnit.MILLISECOND, getTimeZone(timestampLogicalType)))); + case NANOS: + return of(field(new ArrowType.Timestamp(TimeUnit.NANOSECOND, getTimeZone(timestampLogicalType)))); + } + return empty(); + } + + private String getTimeZone(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { + return timestampLogicalType.isAdjustedToUTC() ? 
"UTC" : null; + } + }).orElseThrow(() -> new IllegalArgumentException("illegal type " + type)); } @Override @@ -507,22 +508,25 @@ public TypeMapping convertBOOLEAN(PrimitiveTypeName primitiveTypeName) throws Ru @Override public TypeMapping convertBINARY(PrimitiveTypeName primitiveTypeName) throws RuntimeException { - OriginalType ot = type.getOriginalType(); - if (ot == null) { + LogicalTypeAnnotation logicalTypeAnnotation = type.getLogicalTypeAnnotation(); + if (logicalTypeAnnotation == null) { return field(new ArrowType.Binary()); } - switch (ot) { - case UTF8: - return field(new ArrowType.Utf8()); - case DECIMAL: - return decimal(type.getDecimalMetadata()); - default: - throw new IllegalArgumentException("illegal type " + type); - } + return logicalTypeAnnotation.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) { + return of(field(new ArrowType.Utf8())); + } + + @Override + public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { + return of(decimal(decimalLogicalType.getPrecision(), decimalLogicalType.getScale())); + } + }).orElseThrow(() -> new IllegalArgumentException("illegal type " + type)); } - private TypeMapping decimal(DecimalMetadata decimalMetadata) { - return field(new ArrowType.Decimal(decimalMetadata.getPrecision(), decimalMetadata.getScale())); + private TypeMapping decimal(int precision, int scale) { + return field(new ArrowType.Decimal(precision, scale)); } private TypeMapping integer(int width, boolean signed) { @@ -663,6 +667,11 @@ public TypeMapping visit(Interval type) { return primitive(); } + @Override + public TypeMapping visit(ArrowType.FixedSizeBinary fixedSizeBinary) { + return primitive(); + } + private TypeMapping primitive() { if (!parquetField.isPrimitive()) { throw new IllegalArgumentException("Can not map schemas as one is primitive and the other is not: " + arrowField + " 
!= " + parquetField); diff --git a/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java b/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java index 2d1f028e24..c962b5456f 100644 --- a/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java +++ b/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java @@ -19,6 +19,11 @@ package org.apache.parquet.arrow.schema; import static java.util.Arrays.asList; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MICROS; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MILLIS; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.NANOS; +import static org.apache.parquet.schema.LogicalTypeAnnotation.timeType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.timestampType; import static org.apache.parquet.schema.OriginalType.DATE; import static org.apache.parquet.schema.OriginalType.DECIMAL; import static org.apache.parquet.schema.OriginalType.INTERVAL; @@ -62,12 +67,12 @@ import org.apache.parquet.arrow.schema.SchemaMapping.TypeMappingVisitor; import org.apache.parquet.arrow.schema.SchemaMapping.UnionTypeMapping; import org.apache.parquet.example.Paper; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.Types; +import org.junit.Assert; import org.junit.Test; -import junit.framework.Assert; - /** * @see SchemaConverter */ @@ -90,7 +95,11 @@ private static Field field(String name, ArrowType type, Field... 
children) { field("f", new ArrowType.FixedSizeList(1), field(null, new ArrowType.Date(DateUnit.DAY))), field("g", new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)), field("h", new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")), - field("i", new ArrowType.Interval(IntervalUnit.DAY_TIME)) + field("i", new ArrowType.Timestamp(TimeUnit.NANOSECOND, "UTC")), + field("j", new ArrowType.Timestamp(TimeUnit.MILLISECOND, null)), + field("k", new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC")), + field("l", new ArrowType.Timestamp(TimeUnit.MICROSECOND, null)), + field("m", new ArrowType.Interval(IntervalUnit.DAY_TIME)) )); private final MessageType complexParquetSchema = Types.buildMessage() .addField(Types.optional(INT32).as(INT_8).named("a")) @@ -105,8 +114,12 @@ private static Field field(String name, ArrowType type, Field... children) { setElementType(Types.optional(INT32).as(DATE).named("element")) .named("f")) .addField(Types.optional(FLOAT).named("g")) - .addField(Types.optional(INT64).as(TIMESTAMP_MILLIS).named("h")) - .addField(Types.optional(FIXED_LEN_BYTE_ARRAY).length(12).as(INTERVAL).named("i")) + .addField(Types.optional(INT64).as(timestampType(true, MILLIS)).named("h")) + .addField(Types.optional(INT64).as(timestampType(true, NANOS)).named("i")) + .addField(Types.optional(INT64).as(timestampType(false, MILLIS)).named("j")) + .addField(Types.optional(INT64).as(timestampType(true, MICROS)).named("k")) + .addField(Types.optional(INT64).as(timestampType(false, MICROS)).named("l")) + .addField(Types.optional(FIXED_LEN_BYTE_ARRAY).length(12).as(INTERVAL).named("m")) .named("root"); private final Schema allTypesArrowSchema = new Schema(asList( @@ -135,8 +148,10 @@ private static Field field(String name, ArrowType type, Field... 
children) { field("m", new ArrowType.Time(TimeUnit.MILLISECOND, 32)), field("n", new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")), field("o", new ArrowType.Interval(IntervalUnit.DAY_TIME)), - field("o1", new ArrowType.Interval(IntervalUnit.YEAR_MONTH)) - )); + field("o1", new ArrowType.Interval(IntervalUnit.YEAR_MONTH)), + field("p", new ArrowType.Time(TimeUnit.NANOSECOND, 64)), + field("q", new ArrowType.Timestamp(TimeUnit.NANOSECOND, "UTC")) + )); private final MessageType allTypesParquetSchema = Types.buildMessage() .addField(Types.optional(BINARY).named("a")) @@ -169,10 +184,12 @@ private static Field field(String name, ArrowType type, Field... children) { .addField(Types.optional(INT64).as(DECIMAL).precision(15).scale(5).named("k1")) .addField(Types.optional(BINARY).as(DECIMAL).precision(25).scale(5).named("k2")) .addField(Types.optional(INT32).as(DATE).named("l")) - .addField(Types.optional(INT32).as(TIME_MILLIS).named("m")) + .addField(Types.optional(INT32).as(timeType(false, MILLIS)).named("m")) .addField(Types.optional(INT64).as(TIMESTAMP_MILLIS).named("n")) .addField(Types.optional(FIXED_LEN_BYTE_ARRAY).length(12).as(INTERVAL).named("o")) .addField(Types.optional(FIXED_LEN_BYTE_ARRAY).length(12).as(INTERVAL).named("o1")) + .addField(Types.optional(INT64).as(timeType(false, NANOS)).named("p")) + .addField(Types.optional(INT64).as(timestampType(true, NANOS)).named("q")) .named("root"); private final Schema supportedTypesArrowSchema = new Schema(asList( @@ -196,7 +213,9 @@ private static Field field(String name, ArrowType type, Field... 
children) { field("j2", new ArrowType.Decimal(25, 5)), field("k", new ArrowType.Date(DateUnit.DAY)), field("l", new ArrowType.Time(TimeUnit.MILLISECOND, 32)), - field("m", new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")) + field("m", new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")), + field("n", new ArrowType.Time(TimeUnit.NANOSECOND, 64)), + field("o", new ArrowType.Timestamp(TimeUnit.NANOSECOND, "UTC")) )); private final MessageType supportedTypesParquetSchema = Types.buildMessage() @@ -225,6 +244,8 @@ private static Field field(String name, ArrowType type, Field... children) { .addField(Types.optional(INT32).as(DATE).named("k")) .addField(Types.optional(INT32).as(TIME_MILLIS).named("l")) .addField(Types.optional(INT64).as(TIMESTAMP_MILLIS).named("m")) + .addField(Types.optional(INT64).as(timeType(true, NANOS)).named("n")) + .addField(Types.optional(INT64).as(timestampType(true, NANOS)).named("o")) .named("root"); private final Schema paperArrowSchema = new Schema(asList( @@ -298,7 +319,7 @@ private void compareFields(List left, List right) { @Test public void testAllMap() throws IOException { SchemaMapping map = converter.map(allTypesArrowSchema, allTypesParquetSchema); - Assert.assertEquals("p, s

, l

, l

, u

, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p", toSummaryString(map)); + Assert.assertEquals("p, s

, l

, l

, u

, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p", toSummaryString(map)); } private String toSummaryString(SchemaMapping map) { @@ -365,7 +386,8 @@ public void testArrowTimeMillisecondToParquet() { MessageType expected = converter.fromArrow(new Schema(asList( field("a", new ArrowType.Time(TimeUnit.MILLISECOND, 32)) ))).getParquetSchema(); - Assert.assertEquals(expected, Types.buildMessage().addField(Types.optional(INT32).as(TIME_MILLIS).named("a")).named("root")); + Assert.assertEquals(expected, + Types.buildMessage().addField(Types.optional(INT32).as(timeType(false, MILLIS)).named("a")).named("root")); } @Test @@ -373,14 +395,8 @@ public void testArrowTimeMicrosecondToParquet() { MessageType expected = converter.fromArrow(new Schema(asList( field("a", new ArrowType.Time(TimeUnit.MICROSECOND, 64)) ))).getParquetSchema(); - Assert.assertEquals(expected, Types.buildMessage().addField(Types.optional(INT64).as(TIME_MICROS).named("a")).named("root")); - } - - @Test(expected = UnsupportedOperationException.class) - public void testArrowTimeNanosecondToParquet() { - converter.fromArrow(new Schema(asList( - field("a", new ArrowType.Time(TimeUnit.NANOSECOND, 64)) - ))).getParquetSchema(); + Assert.assertEquals(expected, + Types.buildMessage().addField(Types.optional(INT64).as(timeType(false, MICROS)).named("a")).named("root")); } @Test @@ -438,13 +454,6 @@ public void testArrowTimestampMicrosecondToParquet() { Assert.assertEquals(expected, Types.buildMessage().addField(Types.optional(INT64).as(TIMESTAMP_MICROS).named("a")).named("root")); } - @Test(expected = UnsupportedOperationException.class) - public void testArrowTimestampNanosecondToParquet() { - converter.fromArrow(new Schema(asList( - field("a", new ArrowType.Timestamp(TimeUnit.NANOSECOND, "UTC")) - ))).getParquetSchema(); - } - @Test public void testParquetInt64TimestampMillisToArrow() { MessageType parquet = Types.buildMessage() diff --git a/parquet-avro/pom.xml b/parquet-avro/pom.xml index 
3592121d76..bc3603fe62 100644 --- a/parquet-avro/pom.xml +++ b/parquet-avro/pom.xml @@ -45,8 +45,8 @@ org.apache.parquet - parquet-format - ${parquet.format.version} + parquet-format-structures + ${project.version} org.apache.avro diff --git a/parquet-avro/src/main/java/org/apache/parquet/avro/AvroSchemaConverter.java b/parquet-avro/src/main/java/org/apache/parquet/avro/AvroSchemaConverter.java index 1bb12b9835..558446e6ba 100644 --- a/parquet-avro/src/main/java/org/apache/parquet/avro/AvroSchemaConverter.java +++ b/parquet-avro/src/main/java/org/apache/parquet/avro/AvroSchemaConverter.java @@ -24,10 +24,9 @@ import org.apache.hadoop.conf.Configuration; import org.apache.parquet.schema.ConversionPatterns; -import org.apache.parquet.schema.DecimalMetadata; import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type; @@ -36,11 +35,21 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Optional; +import static java.util.Optional.empty; +import static java.util.Optional.of; import static org.apache.avro.JsonProperties.NULL_VALUE; import static org.apache.parquet.avro.AvroWriteSupport.WRITE_OLD_LIST_STRUCTURE; import static org.apache.parquet.avro.AvroWriteSupport.WRITE_OLD_LIST_STRUCTURE_DEFAULT; -import static org.apache.parquet.schema.OriginalType.*; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MICROS; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MILLIS; +import static org.apache.parquet.schema.LogicalTypeAnnotation.dateType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.decimalType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.enumType; +import static 
org.apache.parquet.schema.LogicalTypeAnnotation.stringType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.timeType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.timestampType; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*; import static org.apache.parquet.schema.Type.Repetition.REPEATED; @@ -147,11 +156,11 @@ private Type convertField(String fieldName, Schema schema, Type.Repetition repet } else if (type.equals(Schema.Type.BYTES)) { builder = Types.primitive(BINARY, repetition); } else if (type.equals(Schema.Type.STRING)) { - builder = Types.primitive(BINARY, repetition).as(UTF8); + builder = Types.primitive(BINARY, repetition).as(stringType()); } else if (type.equals(Schema.Type.RECORD)) { return new GroupType(repetition, fieldName, convertFields(schema.getFields())); } else if (type.equals(Schema.Type.ENUM)) { - builder = Types.primitive(BINARY, repetition).as(ENUM); + builder = Types.primitive(BINARY, repetition).as(enumType()); } else if (type.equals(Schema.Type.ARRAY)) { if (writeOldListStructure) { return ConversionPatterns.listType(repetition, fieldName, @@ -178,12 +187,10 @@ private Type convertField(String fieldName, Schema schema, Type.Repetition repet LogicalType logicalType = schema.getLogicalType(); if (logicalType != null) { if (logicalType instanceof LogicalTypes.Decimal) { - builder = builder.as(DECIMAL) - .precision(((LogicalTypes.Decimal) logicalType).getPrecision()) - .scale(((LogicalTypes.Decimal) logicalType).getScale()); - + LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType; + builder = builder.as(decimalType(decimal.getScale(), decimal.getPrecision())); } else { - OriginalType annotation = convertLogicalType(logicalType); + LogicalTypeAnnotation annotation = convertLogicalType(logicalType); if (annotation != null) { builder.as(annotation); } @@ -267,7 +274,7 @@ private Schema convertField(final Type parquetType) { final PrimitiveType asPrimitive = 
parquetType.asPrimitiveType(); final PrimitiveTypeName parquetPrimitiveTypeName = asPrimitive.getPrimitiveTypeName(); - final OriginalType annotation = parquetType.getOriginalType(); + final LogicalTypeAnnotation annotation = parquetType.getLogicalTypeAnnotation(); Schema schema = parquetPrimitiveTypeName.convert( new PrimitiveType.PrimitiveTypeNameConverter() { @Override @@ -301,7 +308,8 @@ public Schema convertFIXED_LEN_BYTE_ARRAY(PrimitiveTypeName primitiveTypeName) { } @Override public Schema convertBINARY(PrimitiveTypeName primitiveTypeName) { - if (annotation == OriginalType.UTF8 || annotation == OriginalType.ENUM) { + if (annotation instanceof LogicalTypeAnnotation.StringLogicalTypeAnnotation || + annotation instanceof LogicalTypeAnnotation.EnumLogicalTypeAnnotation) { return Schema.create(Schema.Type.STRING); } else { return Schema.create(Schema.Type.BYTES); @@ -309,9 +317,8 @@ public Schema convertBINARY(PrimitiveTypeName primitiveTypeName) { } }); - LogicalType logicalType = convertOriginalType( - annotation, asPrimitive.getDecimalMetadata()); - if (logicalType != null && (annotation != DECIMAL || + LogicalType logicalType = convertLogicalType(annotation); + if (logicalType != null && (!(annotation instanceof LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) || parquetPrimitiveTypeName == BINARY || parquetPrimitiveTypeName == FIXED_LEN_BYTE_ARRAY)) { schema = logicalType.addToSchema(schema); @@ -321,10 +328,11 @@ public Schema convertBINARY(PrimitiveTypeName primitiveTypeName) { } else { GroupType parquetGroupType = parquetType.asGroupType(); - OriginalType originalType = parquetGroupType.getOriginalType(); - if (originalType != null) { - switch(originalType) { - case LIST: + LogicalTypeAnnotation logicalTypeAnnotation = parquetGroupType.getLogicalTypeAnnotation(); + if (logicalTypeAnnotation != null) { + return logicalTypeAnnotation.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public Optional 
visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) { if (parquetGroupType.getFieldCount()!= 1) { throw new UnsupportedOperationException("Invalid list type " + parquetGroupType); } @@ -334,17 +342,29 @@ public Schema convertBINARY(PrimitiveTypeName primitiveTypeName) { } if (isElementType(repeatedType, parquetGroupType.getName())) { // repeated element types are always required - return Schema.createArray(convertField(repeatedType)); + return of(Schema.createArray(convertField(repeatedType))); } else { Type elementType = repeatedType.asGroupType().getType(0); if (elementType.isRepetition(Type.Repetition.OPTIONAL)) { - return Schema.createArray(optional(convertField(elementType))); + return of(Schema.createArray(optional(convertField(elementType)))); } else { - return Schema.createArray(convertField(elementType)); + return of(Schema.createArray(convertField(elementType))); } } - case MAP_KEY_VALUE: // for backward-compatibility - case MAP: + } + + @Override + // for backward-compatibility + public Optional visit(LogicalTypeAnnotation.MapKeyValueTypeAnnotation mapKeyValueLogicalType) { + return visitMapOrMapKeyValue(); + } + + @Override + public Optional visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) { + return visitMapOrMapKeyValue(); + } + + private Optional visitMapOrMapKeyValue() { if (parquetGroupType.getFieldCount() != 1 || parquetGroupType.getType(0).isPrimitive()) { throw new UnsupportedOperationException("Invalid map type " + parquetGroupType); } @@ -356,24 +376,23 @@ public Schema convertBINARY(PrimitiveTypeName primitiveTypeName) { Type keyType = mapKeyValType.getType(0); if (!keyType.isPrimitive() || !keyType.asPrimitiveType().getPrimitiveTypeName().equals(PrimitiveTypeName.BINARY) || - !keyType.getOriginalType().equals(OriginalType.UTF8)) { + !keyType.getLogicalTypeAnnotation().equals(stringType())) { throw new IllegalArgumentException("Map key type must be binary (UTF8): " + keyType); } Type valueType = 
mapKeyValType.getType(1); if (valueType.isRepetition(Type.Repetition.OPTIONAL)) { - return Schema.createMap(optional(convertField(valueType))); + return of(Schema.createMap(optional(convertField(valueType)))); } else { - return Schema.createMap(convertField(valueType)); + return of(Schema.createMap(convertField(valueType))); } - case ENUM: - return Schema.create(Schema.Type.STRING); - case UTF8: - default: - throw new UnsupportedOperationException("Cannot convert Parquet type " + - parquetType); + } - } + @Override + public Optional visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { + return of(Schema.create(Schema.Type.STRING)); + } + }).orElseThrow(() -> new UnsupportedOperationException("Cannot convert Parquet type " + parquetType)); } else { // if no original type then it's a record return convertFields(parquetGroupType.getName(), parquetGroupType.getFields()); @@ -381,44 +400,65 @@ public Schema convertBINARY(PrimitiveTypeName primitiveTypeName) { } } - private OriginalType convertLogicalType(LogicalType logicalType) { + private LogicalTypeAnnotation convertLogicalType(LogicalType logicalType) { if (logicalType == null) { return null; } else if (logicalType instanceof LogicalTypes.Decimal) { - return OriginalType.DECIMAL; + LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType; + return decimalType(decimal.getScale(), decimal.getPrecision()); } else if (logicalType instanceof LogicalTypes.Date) { - return OriginalType.DATE; + return dateType(); } else if (logicalType instanceof LogicalTypes.TimeMillis) { - return OriginalType.TIME_MILLIS; + return timeType(true, MILLIS); } else if (logicalType instanceof LogicalTypes.TimeMicros) { - return OriginalType.TIME_MICROS; + return timeType(true, MICROS); } else if (logicalType instanceof LogicalTypes.TimestampMillis) { - return OriginalType.TIMESTAMP_MILLIS; + return timestampType(true, MILLIS); } else if (logicalType instanceof LogicalTypes.TimestampMicros) { - return 
OriginalType.TIMESTAMP_MICROS; + return timestampType(true, MICROS); } return null; } - private LogicalType convertOriginalType(OriginalType annotation, DecimalMetadata meta) { + private LogicalType convertLogicalType(LogicalTypeAnnotation annotation) { if (annotation == null) { return null; } - switch (annotation) { - case DECIMAL: - return LogicalTypes.decimal(meta.getPrecision(), meta.getScale()); - case DATE: - return LogicalTypes.date(); - case TIME_MILLIS: - return LogicalTypes.timeMillis(); - case TIME_MICROS: - return LogicalTypes.timeMicros(); - case TIMESTAMP_MILLIS: - return LogicalTypes.timestampMillis(); - case TIMESTAMP_MICROS: - return LogicalTypes.timestampMicros(); - } - return null; + return annotation.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { + return of(LogicalTypes.decimal(decimalLogicalType.getPrecision(), decimalLogicalType.getScale())); + } + + @Override + public Optional visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { + return of(LogicalTypes.date()); + } + + @Override + public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { + LogicalTypeAnnotation.TimeUnit unit = timeLogicalType.getUnit(); + switch (unit) { + case MILLIS: + return of(LogicalTypes.timeMillis()); + case MICROS: + return of(LogicalTypes.timeMicros()); + } + return empty(); + } + + @Override + public Optional visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { + LogicalTypeAnnotation.TimeUnit unit = timestampLogicalType.getUnit(); + switch (unit) { + case MILLIS: + return of(LogicalTypes.timestampMillis()); + case MICROS: + return of(LogicalTypes.timestampMicros()); + } + return empty(); + } + }).orElse(null); } /** diff --git a/parquet-avro/src/test/java/org/apache/parquet/avro/TestAvroSchemaConverter.java 
b/parquet-avro/src/test/java/org/apache/parquet/avro/TestAvroSchemaConverter.java index 942e3b1378..bfaeec3d6b 100644 --- a/parquet-avro/src/test/java/org/apache/parquet/avro/TestAvroSchemaConverter.java +++ b/parquet-avro/src/test/java/org/apache/parquet/avro/TestAvroSchemaConverter.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -616,7 +616,7 @@ public void testTimeMillisType() throws Exception { testRoundTripConversion(expected, "message myrecord {\n" + - " required int32 time (TIME_MILLIS);\n" + + " required int32 time (TIME(MILLIS,true));\n" + "}\n"); for (PrimitiveTypeName primitive : new PrimitiveTypeName[] @@ -646,7 +646,7 @@ public void testTimeMicrosType() throws Exception { testRoundTripConversion(expected, "message myrecord {\n" + - " required int64 time (TIME_MICROS);\n" + + " required int64 time (TIME(MICROS,true));\n" + "}\n"); for (PrimitiveTypeName primitive : new PrimitiveTypeName[] @@ -676,7 +676,7 @@ public void testTimestampMillisType() throws Exception { testRoundTripConversion(expected, "message myrecord {\n" + - " required int64 timestamp (TIMESTAMP_MILLIS);\n" + + " required int64 timestamp (TIMESTAMP(MILLIS,true));\n" + "}\n"); for (PrimitiveTypeName primitive : new PrimitiveTypeName[] @@ -706,7 +706,7 @@ public void testTimestampMicrosType() throws Exception { testRoundTripConversion(expected, "message myrecord {\n" + - " required int64 timestamp 
(TIMESTAMP_MICROS);\n" + + " required int64 timestamp (TIMESTAMP(MICROS,true));\n" + "}\n"); for (PrimitiveTypeName primitive : new PrimitiveTypeName[] diff --git a/parquet-cascading-common23/src/main/java/org/apache/parquet/cascading/convert/TupleConverter.java b/parquet-cascading-common23/src/main/java/org/apache/parquet/cascading/convert/TupleConverter.java index 3741165b09..4c1240b859 100644 --- a/parquet-cascading-common23/src/main/java/org/apache/parquet/cascading/convert/TupleConverter.java +++ b/parquet-cascading-common23/src/main/java/org/apache/parquet/cascading/convert/TupleConverter.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -27,10 +27,7 @@ import org.apache.parquet.io.api.PrimitiveConverter; import org.apache.parquet.pig.TupleConversionException; import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.OriginalType; -import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type; -import org.apache.parquet.schema.Type.Repetition; public class TupleConverter extends GroupConverter { diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java index 990193c731..fa69ce7a40 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java @@ -32,6 +32,7 @@ import 
org.apache.parquet.cli.commands.ConvertCommand; import org.apache.parquet.cli.commands.ParquetMetadataCommand; import org.apache.parquet.cli.commands.SchemaCommand; +import org.apache.parquet.cli.commands.ShowColumnIndexCommand; import org.apache.parquet.cli.commands.ShowDictionaryCommand; import org.apache.parquet.cli.commands.ShowPagesCommand; import org.apache.parquet.cli.commands.ToAvroCommand; @@ -87,6 +88,7 @@ public class Main extends Configured implements Tool { jc.addCommand("to-avro", new ToAvroCommand(console)); jc.addCommand("cat", new CatCommand(console, 0)); jc.addCommand("head", new CatCommand(console, 10)); + jc.addCommand("column-index", new ShowColumnIndexCommand(console)); } @Override diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Util.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Util.java index 98bc1e5112..961c7f0c44 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/Util.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Util.java @@ -80,7 +80,12 @@ public static String humanReadable(long bytes) { } } + @Deprecated public static String minMaxAsString(Statistics stats, OriginalType annotation) { + return minMaxAsString(stats); + } + + public static String minMaxAsString(Statistics stats) { if (stats == null) { return "no stats"; } @@ -90,7 +95,12 @@ public static String minMaxAsString(Statistics stats, OriginalType annotation) { return String.format("%s / %s", humanReadable(stats.minAsString(), 30), humanReadable(stats.maxAsString(), 30)); } + @Deprecated public static String toString(Statistics stats, long count, OriginalType annotation) { + return toString(stats, count); + } + + public static String toString(Statistics stats, long count) { if (stats == null) { return "no stats"; } diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java index 54fe6579b9..a452369e26 
100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java @@ -169,12 +169,12 @@ private void printColumnChunk(Logger console, int width, ColumnChunkMetaData col console.info(String.format("%-" + width + "s FIXED[%d] %s %-7s %-9d %-8s %-7s %s", name, type.getTypeLength(), shortCodec(codec), encodingSummary, count, humanReadable(perValue), stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()), - minMaxAsString(stats, type.getOriginalType()))); + minMaxAsString(stats))); } else { console.info(String.format("%-" + width + "s %-9s %s %-7s %-9d %-10s %-7s %s", name, typeName, shortCodec(codec), encodingSummary, count, humanReadable(perValue), stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()), - minMaxAsString(stats, type.getOriginalType()))); + minMaxAsString(stats))); } } } diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowColumnIndexCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowColumnIndexCommand.java new file mode 100644 index 0000000000..38a7094b89 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowColumnIndexCommand.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.cli.commands; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.parquet.cli.BaseCommand; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.io.InputFile; +import org.slf4j.Logger; + +import com.beust.jcommander.Parameter; +import com.beust.jcommander.Parameters; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; + +/** + * parquet-cli command to print column and offset indexes. 
+ */ +@Parameters(commandDescription = "Prints the column and offset indexes of a Parquet file") +public class ShowColumnIndexCommand extends BaseCommand { + public ShowColumnIndexCommand(Logger console) { + super(console); + } + + @Parameter(description = "") + List files; + + @Parameter(names = { "-c", "--column" }, description = "Shows the column/offset indexes for the given column only") + List ColumnPaths; + + @Parameter(names = { "-r", + "--row-group" }, description = "Shows the column/offset indexes for the given row-groups only; " + + "row-groups are referenced by their indexes from 0") + List rowGroupIndexes; + + @Parameter(names = { "-i", "--column-index" }, description = "Shows the column indexes; " + + "active by default unless -o is used") + boolean showColumnIndex; + + @Parameter(names = { "-o", "--offset-index" }, description = "Shows the offset indexes; " + + "active by default unless -i is used") + boolean showOffsetIndex; + + @Override + public List getExamples() { + return Lists.newArrayList( + "# Show only column indexes for column 'col' from a Parquet file", + "-c col -i sample.parquet"); + } + + @Override + public int run() throws IOException { + Preconditions.checkArgument(files != null && files.size() >= 1, + "A Parquet file is required."); + Preconditions.checkArgument(files.size() == 1, + "Cannot process multiple Parquet files."); + + InputFile in = HadoopInputFile.fromPath(qualifiedPath(files.get(0)), getConf()); + if (!showColumnIndex && !showOffsetIndex) { + showColumnIndex = true; + showOffsetIndex = true; + } + + Set rowGroupIndexSet = new HashSet<>(); + if (rowGroupIndexes != null) { + rowGroupIndexSet.addAll(rowGroupIndexes); + } + + try (ParquetFileReader reader = ParquetFileReader.open(in)) { + boolean firstBlock = true; + int rowGroupIndex = 0; + for (BlockMetaData block : reader.getFooter().getBlocks()) { + if (!rowGroupIndexSet.isEmpty() && !rowGroupIndexSet.contains(Integer.toString(rowGroupIndex))) { + ++rowGroupIndex; + 
continue; + } + if (!firstBlock) { + console.info(""); + } + firstBlock = false; + console.info("row-group {}:", rowGroupIndex); + for (ColumnChunkMetaData column : getColumns(block)) { + String path = column.getPath().toDotString(); + if (showColumnIndex) { + console.info("column index for column {}:", path); + ColumnIndex columnIndex = reader.readColumnIndex(column); + if (columnIndex == null) { + console.info("NONE"); + } else { + console.info(columnIndex.toString()); + } + } + if (showOffsetIndex) { + console.info("offset index for column {}:", path); + OffsetIndex offsetIndex = reader.readOffsetIndex(column); + if (offsetIndex == null) { + console.info("NONE"); + } else { + console.info(offsetIndex.toString()); + } + } + } + ++rowGroupIndex; + } + } + return 0; + } + + private List getColumns(BlockMetaData block) { + List columns = block.getColumns(); + if (ColumnPaths == null || ColumnPaths.isEmpty()) { + return columns; + } + Map pathMap = new HashMap<>(); + for (ColumnChunkMetaData column : columns) { + pathMap.put(column.getPath().toDotString(), column); + } + + List filtered = new ArrayList<>(); + for (String path : ColumnPaths) { + ColumnChunkMetaData column = pathMap.get(path); + if (column != null) { + filtered.add(column); + } + } + return filtered; + } + +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowDictionaryCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowDictionaryCommand.java index db427c9c74..20a694ff7f 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowDictionaryCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowDictionaryCommand.java @@ -30,8 +30,8 @@ import org.apache.parquet.column.page.DictionaryPage; import org.apache.parquet.column.page.DictionaryPageReadStore; import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; -import 
org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType; import org.slf4j.Logger; import java.io.IOException; @@ -81,7 +81,7 @@ public int run() throws IOException { for (int i = 0; i <= dict.getMaxId(); i += 1) { switch(type.getPrimitiveTypeName()) { case BINARY: - if (type.getOriginalType() == OriginalType.UTF8) { + if (type.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.StringLogicalTypeAnnotation) { console.info("{}: {}", String.format("%6d", i), Util.humanReadable(dict.decodeToBinary(i).toStringUsingUTF8(), 70)); } else { diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java index 4d0e2c9ba5..1ac03aad7a 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java @@ -193,7 +193,7 @@ public String visit(DataPageV1 page) { int count = page.getValueCount(); String numNulls = page.getStatistics().isNumNullsSet() ? Long.toString(page.getStatistics().getNumNulls()) : ""; float perValue = ((float) totalSize) / count; - String minMax = minMaxAsString(page.getStatistics(), type.getOriginalType()); + String minMax = minMaxAsString(page.getStatistics()); return String.format("%3d-%-3d %-5s %s %-2s %-7d %-10s %-10s %-8s %-7s %s", rowGroupNum, pageNum, "data", shortCodec, enc, count, humanReadable(perValue), humanReadable(totalSize), "", numNulls, minMax); @@ -207,7 +207,7 @@ public String visit(DataPageV2 page) { int numRows = page.getRowCount(); int numNulls = page.getNullCount(); float perValue = ((float) totalSize) / count; - String minMax = minMaxAsString(page.getStatistics(), type.getOriginalType()); + String minMax = minMaxAsString(page.getStatistics()); String compression = (page.isCompressed() ? 
shortCodec : "_"); return String.format("%3d-%-3d %-5s %s %-2s %-7d %-10s %-10s %-8d %-7s %s", rowGroupNum, pageNum, "data", compression, enc, count, humanReadable(perValue), diff --git a/parquet-column/src/main/java/org/apache/parquet/column/ColumnReader.java b/parquet-column/src/main/java/org/apache/parquet/column/ColumnReader.java index 52d269ef06..6d93eeed5f 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/ColumnReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/ColumnReader.java @@ -41,7 +41,10 @@ public interface ColumnReader { /** * @return the totalCount of values to be consumed + * @deprecated will be removed in 2.0.0; Total values might not be able to be counted before reading the values (e.g. + * in case of column index based filtering) */ + @Deprecated long getTotalValueCount(); /** diff --git a/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java b/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java index f01888aed8..572c6c9c87 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java @@ -18,25 +18,25 @@ */ package org.apache.parquet.column; -import static org.apache.parquet.bytes.BytesUtils.getWidthFromMaxInt; - -import java.util.HashMap; - import org.apache.parquet.Preconditions; import org.apache.parquet.bytes.ByteBufferAllocator; import org.apache.parquet.bytes.CapacityByteArrayOutputStream; import org.apache.parquet.bytes.HeapByteBufferAllocator; + +import static org.apache.parquet.bytes.BytesUtils.getWidthFromMaxInt; import org.apache.parquet.column.impl.ColumnWriteStoreV1; import org.apache.parquet.column.impl.ColumnWriteStoreV2; import org.apache.parquet.column.page.PageWriteStore; import org.apache.parquet.column.values.ValuesWriter; import org.apache.parquet.column.values.bitpacking.DevNullValuesWriter; import 
org.apache.parquet.column.values.factory.DefaultValuesWriterFactory; -import org.apache.parquet.column.values.factory.ValuesWriterFactory; import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder; import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter; +import org.apache.parquet.column.values.factory.ValuesWriterFactory; import org.apache.parquet.schema.MessageType; +import java.util.HashMap; + /** * This class represents all the configurable Parquet properties. */ @@ -49,8 +49,10 @@ public class ParquetProperties { public static final boolean DEFAULT_ESTIMATE_ROW_COUNT_FOR_PAGE_SIZE_CHECK = true; public static final int DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK = 100; public static final int DEFAULT_MAXIMUM_RECORD_COUNT_FOR_CHECK = 10000; + public static final int DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH = 64; public static final boolean DEFAULT_BLOOM_FILTER_ENABLED = false; + public static final ValuesWriterFactory DEFAULT_VALUES_WRITER_FACTORY = new DefaultValuesWriterFactory(); private static final int MIN_SLAB_SIZE = 64; @@ -86,12 +88,14 @@ public static WriterVersion fromString(String name) { private final boolean estimateNextSizeCheck; private final ByteBufferAllocator allocator; private final ValuesWriterFactory valuesWriterFactory; + private final int columnIndexTruncateLength; private final boolean enableBloomFilter; - private final HashMap bloomFilterExpectValues; + private final HashMap bloomFilterExpectedDistinctNumbers; private ParquetProperties(WriterVersion writerVersion, int pageSize, int dictPageSize, boolean enableDict, int minRowCountForPageSizeCheck, int maxRowCountForPageSizeCheck, boolean estimateNextSizeCheck, ByteBufferAllocator allocator, - ValuesWriterFactory writerFactory, boolean enableBloomFilter, HashMap bloomFilterExpectValues) { + ValuesWriterFactory writerFactory, int columnIndexMinMaxTruncateLength, boolean enableBloomFilter, + HashMap bloomFilterExpectedDistinctNumber) { 
this.pageSizeThreshold = pageSize; this.initialSlabSize = CapacityByteArrayOutputStream .initialSlabSizeHeuristic(MIN_SLAB_SIZE, pageSizeThreshold, 10); @@ -102,9 +106,12 @@ private ParquetProperties(WriterVersion writerVersion, int pageSize, int dictPag this.maxRowCountForPageSizeCheck = maxRowCountForPageSizeCheck; this.estimateNextSizeCheck = estimateNextSizeCheck; this.allocator = allocator; - this.enableBloomFilter = enableBloomFilter; - this.bloomFilterExpectValues = bloomFilterExpectValues; + this.valuesWriterFactory = writerFactory; + this.columnIndexTruncateLength = columnIndexMinMaxTruncateLength; + + this.enableBloomFilter = enableBloomFilter; + this.bloomFilterExpectedDistinctNumbers = bloomFilterExpectedDistinctNumber; } public ValuesWriter newRepetitionLevelWriter(ColumnDescriptor path) { @@ -165,19 +172,11 @@ public ByteBufferAllocator getAllocator() { return allocator; } - public boolean isBloomFilterEnabled() { - return enableBloomFilter; - } - - public HashMap getBloomFilterExpectValues() { - return bloomFilterExpectValues; - } - public ColumnWriteStore newColumnWriteStore(MessageType schema, PageWriteStore pageStore) { switch (writerVersion) { case PARQUET_1_0: - return new ColumnWriteStoreV1(pageStore, this); + return new ColumnWriteStoreV1(schema, pageStore, this); case PARQUET_2_0: return new ColumnWriteStoreV2(schema, pageStore, this); default: @@ -197,10 +196,22 @@ public ValuesWriterFactory getValuesWriterFactory() { return valuesWriterFactory; } + public int getColumnIndexTruncateLength() { + return columnIndexTruncateLength; + } + public boolean estimateNextSizeCheck() { return estimateNextSizeCheck; } + public boolean isBloomFilterEnabled() { + return enableBloomFilter; + } + + public HashMap getBloomFilterExpectedDistinctNumbers() { + return bloomFilterExpectedDistinctNumbers; + } + public static Builder builder() { return new Builder(); } @@ -213,14 +224,15 @@ public static class Builder { private int pageSize = DEFAULT_PAGE_SIZE; 
private int dictPageSize = DEFAULT_DICTIONARY_PAGE_SIZE; private boolean enableDict = DEFAULT_IS_DICTIONARY_ENABLED; - private boolean enableBloomFilter = DEFAULT_BLOOM_FILTER_ENABLED; - private HashMap bloomFilterExpectValues = new HashMap<>(); private WriterVersion writerVersion = DEFAULT_WRITER_VERSION; private int minRowCountForPageSizeCheck = DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK; private int maxRowCountForPageSizeCheck = DEFAULT_MAXIMUM_RECORD_COUNT_FOR_CHECK; private boolean estimateNextSizeCheck = DEFAULT_ESTIMATE_ROW_COUNT_FOR_PAGE_SIZE_CHECK; private ByteBufferAllocator allocator = new HeapByteBufferAllocator(); private ValuesWriterFactory valuesWriterFactory = DEFAULT_VALUES_WRITER_FACTORY; + private int columnIndexTruncateLength = DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH; + private boolean enableBloomFilter = DEFAULT_BLOOM_FILTER_ENABLED; + private HashMap bloomFilterExpectedDistinctNumbers = new HashMap<>(); private Builder() { } @@ -234,7 +246,7 @@ private Builder(ParquetProperties toCopy) { this.estimateNextSizeCheck = toCopy.estimateNextSizeCheck; this.allocator = toCopy.allocator; this.enableBloomFilter = toCopy.enableBloomFilter; - this.bloomFilterExpectValues = toCopy.bloomFilterExpectValues; + this.bloomFilterExpectedDistinctNumbers = toCopy.bloomFilterExpectedDistinctNumbers; } /** @@ -274,38 +286,6 @@ public Builder withDictionaryPageSize(int dictionaryPageSize) { return this; } - /** - * Set to enable Bloom filter. - * - * @param enableBloomFilter a boolean to indicate whether to enable Bloom filter. - * @return this builder for method chaining. - */ - public Builder withBloomFilterEnabled(boolean enableBloomFilter) { - this.enableBloomFilter = enableBloomFilter; - return this; - } - - /** - * Set Bloom filter info for columns. 
- * - * @param bloomFilterColumnNames the columns to be enabled for Bloom filter - * @param bloomFilterDistinctNumbers the expected distinct number of values corresponding to columns - * @return this builder for method chaining - */ - public Builder withBloomFilterInfo(String bloomFilterColumnNames, String bloomFilterDistinctNumbers) { - String[] columnNames = bloomFilterColumnNames.split(","); - String[] expectedDistinctNumber = bloomFilterDistinctNumbers.split(","); - - Preconditions.checkArgument(columnNames.length == expectedDistinctNumber.length, - "Column names are not matched to sizes"); - - for (int i = 0; i < columnNames.length; i++) { - this.bloomFilterExpectValues.put(columnNames[i], Long.getLong(expectedDistinctNumber[i])); - } - - return this; - } - /** * Set the {@link WriterVersion format version}. * @@ -349,12 +329,47 @@ public Builder withValuesWriterFactory(ValuesWriterFactory factory) { return this; } + public Builder withColumnIndexTruncateLength(int length) { + Preconditions.checkArgument(length > 0, "Invalid column index min/max truncate length (negative) : %s", length); + this.columnIndexTruncateLength = length; + return this; + } + + /** + * Set to enable Bloom filter. + * + * @param enableBloomFilter a boolean to indicate whether to enable Bloom filter. + * @return this builder for method chaining. + */ + public Builder withBloomFilterEnabled(boolean enableBloomFilter) { + this.enableBloomFilter = enableBloomFilter; + return this; + } + /** + * Set Bloom filter info for columns. 
+ * + * @param bloomFilterColumnNames comma-separated names of the columns to be enabled for Bloom filter + * @param bloomFilterDistinctNumbers comma-separated expected numbers of distinct values, one per column and in the same order as the column names + * @return this builder for method chaining + */ + public Builder withBloomFilterInfo(String bloomFilterColumnNames, String bloomFilterDistinctNumbers) { + String[] columnNames = bloomFilterColumnNames.split(","); + String[] expectedDistinctNumber = bloomFilterDistinctNumbers.split(","); + Preconditions.checkArgument(columnNames.length == expectedDistinctNumber.length, + "Column names are not matched to sizes"); + for (int i = 0; i < columnNames.length; i++) { + this.bloomFilterExpectedDistinctNumbers.put(columnNames[i], Long.parseLong(expectedDistinctNumber[i])); + } + return this; + } + + public ParquetProperties build() { ParquetProperties properties = new ParquetProperties(writerVersion, pageSize, dictPageSize, enableDict, minRowCountForPageSizeCheck, maxRowCountForPageSizeCheck, - estimateNextSizeCheck, allocator, valuesWriterFactory, - enableBloomFilter, bloomFilterExpectValues); + estimateNextSizeCheck, allocator, valuesWriterFactory, columnIndexTruncateLength, + enableBloomFilter, bloomFilterExpectedDistinctNumbers); // we pass a constructed but uninitialized factory to ParquetProperties above as currently // creation of ValuesWriters is invoked from within ParquetProperties.
In the future // we'd like to decouple that and won't need to pass an object to properties and then pass the diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReadStoreImpl.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReadStoreImpl.java index 37845961ad..b7e159775f 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReadStoreImpl.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReadStoreImpl.java @@ -18,6 +18,9 @@ */ package org.apache.parquet.column.impl; +import java.util.Optional; +import java.util.PrimitiveIterator; + import org.apache.parquet.VersionParser; import org.apache.parquet.VersionParser.ParsedVersion; import org.apache.parquet.VersionParser.VersionParseException; @@ -72,10 +75,17 @@ public ColumnReadStoreImpl(PageReadStore pageReadStore, @Override public ColumnReader getColumnReader(ColumnDescriptor path) { - return newMemColumnReader(path, pageReadStore.getPageReader(path)); + PrimitiveConverter converter = getPrimitiveConverter(path); + PageReader pageReader = pageReadStore.getPageReader(path); + Optional rowIndexes = pageReadStore.getRowIndexes(); + if (rowIndexes.isPresent()) { + return new SynchronizingColumnReader(path, pageReader, converter, writerVersion, rowIndexes.get()); + } else { + return new ColumnReaderImpl(path, pageReader, converter, writerVersion); + } } - private ColumnReaderImpl newMemColumnReader(ColumnDescriptor path, PageReader pageReader) { + public ColumnReaderImpl newMemColumnReader(ColumnDescriptor path, PageReader pageReader) { PrimitiveConverter converter = getPrimitiveConverter(path); return new ColumnReaderImpl(path, pageReader, converter, writerVersion); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReaderBase.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReaderBase.java new file mode 100644 index 0000000000..c929431c64 --- /dev/null +++ 
b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReaderBase.java @@ -0,0 +1,760 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.impl; + +import static java.lang.String.format; +import static org.apache.parquet.Preconditions.checkNotNull; +import static org.apache.parquet.column.ValuesType.DEFINITION_LEVEL; +import static org.apache.parquet.column.ValuesType.REPETITION_LEVEL; +import static org.apache.parquet.column.ValuesType.VALUES; + +import java.io.IOException; + +import org.apache.parquet.CorruptDeltaByteArrays; +import org.apache.parquet.VersionParser.ParsedVersion; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.ColumnReader; +import org.apache.parquet.column.Dictionary; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.page.DataPage; +import org.apache.parquet.column.page.DataPageV1; +import org.apache.parquet.column.page.DataPageV2; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.page.PageReader; 
+import org.apache.parquet.column.values.RequiresPreviousReader; +import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder; +import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.io.api.PrimitiveConverter; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeNameConverter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Base superclass for {@link ColumnReader} implementations. + */ +abstract class ColumnReaderBase implements ColumnReader { + private static final Logger LOG = LoggerFactory.getLogger(ColumnReaderBase.class); + + /** + * binds the lower level page decoder to the record converter materializing the records + */ + private static abstract class Binding { + + /** + * read one value from the underlying page + */ + abstract void read(); + + /** + * skip one value from the underlying page + */ + abstract void skip(); + + /** + * Skips n values from the underlying page + * + * @param n + * the number of values to be skipped + */ + abstract void skip(int n); + + /** + * write current value to converter + */ + abstract void writeValue(); + + /** + * @return current value + */ + public int getDictionaryId() { + throw new UnsupportedOperationException(); + } + + /** + * @return current value + */ + public int getInteger() { + throw new UnsupportedOperationException(); + } + + /** + * @return current value + */ + public boolean getBoolean() { + throw new UnsupportedOperationException(); + } + + /** + * @return current value + */ + public long getLong() { + throw new UnsupportedOperationException(); + } + + /** + * @return current value + */ + public Binary getBinary() { + throw new UnsupportedOperationException(); + } + + /** + * @return current value + */ + public float getFloat() { + throw new UnsupportedOperationException(); 
+ } + + /** + * @return current value + */ + public double getDouble() { + throw new UnsupportedOperationException(); + } + } + + private final ParsedVersion writerVersion; + private final ColumnDescriptor path; + private final long totalValueCount; + private final PageReader pageReader; + private final Dictionary dictionary; + + private IntIterator repetitionLevelColumn; + private IntIterator definitionLevelColumn; + protected ValuesReader dataColumn; + private Encoding currentEncoding; + + private int repetitionLevel; + private int definitionLevel; + private int dictionaryId; + + private long endOfPageValueCount; + private long readValues = 0; + private int pageValueCount = 0; + + private final PrimitiveConverter converter; + private Binding binding; + private final int maxDefinitionLevel; + + // this is needed because we will attempt to read the value twice when filtering + // TODO: rework that + private boolean valueRead; + + private void bindToDictionary(final Dictionary dictionary) { + binding = + new Binding() { + void read() { + dictionaryId = dataColumn.readValueDictionaryId(); + } + public void skip() { + dataColumn.skip(); + } + @Override + void skip(int n) { + dataColumn.skip(n); + } + public int getDictionaryId() { + return dictionaryId; + } + void writeValue() { + converter.addValueFromDictionary(dictionaryId); + } + public int getInteger() { + return dictionary.decodeToInt(dictionaryId); + } + public boolean getBoolean() { + return dictionary.decodeToBoolean(dictionaryId); + } + public long getLong() { + return dictionary.decodeToLong(dictionaryId); + } + public Binary getBinary() { + return dictionary.decodeToBinary(dictionaryId); + } + public float getFloat() { + return dictionary.decodeToFloat(dictionaryId); + } + public double getDouble() { + return dictionary.decodeToDouble(dictionaryId); + } + }; + } + + private void bind(PrimitiveTypeName type) { + binding = type.convert(new PrimitiveTypeNameConverter() { + @Override + public Binding 
convertFLOAT(PrimitiveTypeName primitiveTypeName) throws RuntimeException { + return new Binding() { + float current; + void read() { + current = dataColumn.readFloat(); + } + public void skip() { + current = 0; + dataColumn.skip(); + } + @Override + void skip(int n) { + current = 0; + dataColumn.skip(n); + } + public float getFloat() { + return current; + } + void writeValue() { + converter.addFloat(current); + } + }; + } + @Override + public Binding convertDOUBLE(PrimitiveTypeName primitiveTypeName) throws RuntimeException { + return new Binding() { + double current; + void read() { + current = dataColumn.readDouble(); + } + public void skip() { + current = 0; + dataColumn.skip(); + } + @Override + void skip(int n) { + current = 0; + dataColumn.skip(n); + } + public double getDouble() { + return current; + } + void writeValue() { + converter.addDouble(current); + } + }; + } + @Override + public Binding convertINT32(PrimitiveTypeName primitiveTypeName) throws RuntimeException { + return new Binding() { + int current; + void read() { + current = dataColumn.readInteger(); + } + public void skip() { + current = 0; + dataColumn.skip(); + } + @Override + void skip(int n) { + current = 0; + dataColumn.skip(n); + } + @Override + public int getInteger() { + return current; + } + void writeValue() { + converter.addInt(current); + } + }; + } + @Override + public Binding convertINT64(PrimitiveTypeName primitiveTypeName) throws RuntimeException { + return new Binding() { + long current; + void read() { + current = dataColumn.readLong(); + } + public void skip() { + current = 0; + dataColumn.skip(); + } + @Override + void skip(int n) { + current = 0; + dataColumn.skip(n); + } + @Override + public long getLong() { + return current; + } + void writeValue() { + converter.addLong(current); + } + }; + } + @Override + public Binding convertINT96(PrimitiveTypeName primitiveTypeName) throws RuntimeException { + return this.convertBINARY(primitiveTypeName); + } + @Override + public 
Binding convertFIXED_LEN_BYTE_ARRAY( + PrimitiveTypeName primitiveTypeName) throws RuntimeException { + return this.convertBINARY(primitiveTypeName); + } + @Override + public Binding convertBOOLEAN(PrimitiveTypeName primitiveTypeName) throws RuntimeException { + return new Binding() { + boolean current; + void read() { + current = dataColumn.readBoolean(); + } + public void skip() { + current = false; + dataColumn.skip(); + } + @Override + void skip(int n) { + current = false; + dataColumn.skip(n); + } + @Override + public boolean getBoolean() { + return current; + } + void writeValue() { + converter.addBoolean(current); + } + }; + } + @Override + public Binding convertBINARY(PrimitiveTypeName primitiveTypeName) throws RuntimeException { + return new Binding() { + Binary current; + void read() { + current = dataColumn.readBytes(); + } + public void skip() { + current = null; + dataColumn.skip(); + } + @Override + void skip(int n) { + current = null; + dataColumn.skip(n); + } + @Override + public Binary getBinary() { + return current; + } + void writeValue() { + converter.addBinary(current); + } + }; + } + }); + } + + /** + * creates a reader for triplets + * @param path the descriptor for the corresponding column + * @param pageReader the underlying store to read from + * @param converter a converter that materializes the values in this column in the current record + * @param writerVersion writer version string from the Parquet file being read + */ + ColumnReaderBase(ColumnDescriptor path, PageReader pageReader, PrimitiveConverter converter, ParsedVersion writerVersion) { + this.path = checkNotNull(path, "path"); + this.pageReader = checkNotNull(pageReader, "pageReader"); + this.converter = checkNotNull(converter, "converter"); + this.writerVersion = writerVersion; + this.maxDefinitionLevel = path.getMaxDefinitionLevel(); + DictionaryPage dictionaryPage = pageReader.readDictionaryPage(); + if (dictionaryPage != null) { + try { + this.dictionary = 
dictionaryPage.getEncoding().initDictionary(path, dictionaryPage); + if (converter.hasDictionarySupport()) { + converter.setDictionary(dictionary); + } + } catch (IOException e) { + throw new ParquetDecodingException("could not decode the dictionary for " + path, e); + } + } else { + this.dictionary = null; + } + this.totalValueCount = pageReader.getTotalValueCount(); + if (totalValueCount <= 0) { + throw new ParquetDecodingException("totalValueCount '" + totalValueCount + "' <= 0"); + } + } + + boolean isFullyConsumed() { + return readValues >= totalValueCount; + } + + /** + * {@inheritDoc} + * @see org.apache.parquet.column.ColumnReader#writeCurrentValueToConverter() + */ + @Override + public void writeCurrentValueToConverter() { + readValue(); + this.binding.writeValue(); + } + + @Override + public int getCurrentValueDictionaryID() { + readValue(); + return binding.getDictionaryId(); + } + + /** + * {@inheritDoc} + * @see org.apache.parquet.column.ColumnReader#getInteger() + */ + @Override + public int getInteger() { + readValue(); + return this.binding.getInteger(); + } + + /** + * {@inheritDoc} + * @see org.apache.parquet.column.ColumnReader#getBoolean() + */ + @Override + public boolean getBoolean() { + readValue(); + return this.binding.getBoolean(); + } + + /** + * {@inheritDoc} + * @see org.apache.parquet.column.ColumnReader#getLong() + */ + @Override + public long getLong() { + readValue(); + return this.binding.getLong(); + } + + /** + * {@inheritDoc} + * @see org.apache.parquet.column.ColumnReader#getBinary() + */ + @Override + public Binary getBinary() { + readValue(); + return this.binding.getBinary(); + } + + /** + * {@inheritDoc} + * @see org.apache.parquet.column.ColumnReader#getFloat() + */ + @Override + public float getFloat() { + readValue(); + return this.binding.getFloat(); + } + + /** + * {@inheritDoc} + * @see org.apache.parquet.column.ColumnReader#getDouble() + */ + @Override + public double getDouble() { + readValue(); + return 
this.binding.getDouble(); + } + + /** + * {@inheritDoc} + * @see org.apache.parquet.column.ColumnReader#getCurrentRepetitionLevel() + */ + @Override + public int getCurrentRepetitionLevel() { + return repetitionLevel; + } + + /** + * {@inheritDoc} + * @see org.apache.parquet.column.ColumnReader#getDescriptor() + */ + @Override + public ColumnDescriptor getDescriptor() { + return path; + } + + /** + * Reads the value into the binding. + */ + public void readValue() { + try { + if (!valueRead) { + binding.read(); + valueRead = true; + } + } catch (RuntimeException e) { + if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, currentEncoding) && + e instanceof ArrayIndexOutOfBoundsException) { + // this is probably PARQUET-246, which may happen if reading data with + // MR because this can't be detected without reading all footers + throw new ParquetDecodingException("Read failure possibly due to " + + "PARQUET-246: try setting parquet.split.files to false", + new ParquetDecodingException( + format("Can't read value in column %s at value %d out of %d, " + + "%d out of %d in currentPage. repetition level: " + + "%d, definition level: %d", + path, readValues, totalValueCount, + readValues - (endOfPageValueCount - pageValueCount), + pageValueCount, repetitionLevel, definitionLevel), + e)); + } + throw new ParquetDecodingException( + format("Can't read value in column %s at value %d out of %d, " + + "%d out of %d in currentPage. 
repetition level: " + + "%d, definition level: %d", + path, readValues, totalValueCount, + readValues - (endOfPageValueCount - pageValueCount), + pageValueCount, repetitionLevel, definitionLevel), + e); + } + } + + /** + * {@inheritDoc} + * @see org.apache.parquet.column.ColumnReader#skip() + */ + @Override + public void skip() { + if (!valueRead) { + binding.skip(); + valueRead = true; + } + } + + /** + * {@inheritDoc} + * @see org.apache.parquet.column.ColumnReader#getCurrentDefinitionLevel() + */ + @Override + public int getCurrentDefinitionLevel() { + return definitionLevel; + } + + private void checkRead() { + int rl, dl; + int skipValues = 0; + for (;;) { + if (isPageFullyConsumed()) { + if (isFullyConsumed()) { + LOG.debug("end reached"); + repetitionLevel = 0; // the next repetition level + return; + } + readPage(); + skipValues = 0; + } + rl = repetitionLevelColumn.nextInt(); + dl = definitionLevelColumn.nextInt(); + ++readValues; + if (!skipRL(rl)) { + break; + } + if (dl == maxDefinitionLevel) { + ++skipValues; + } + } + binding.skip(skipValues); + repetitionLevel = rl; + definitionLevel = dl; + } + + /* + * Returns if current levels / value shall be skipped based on the specified repetition level. 
+ */ + abstract boolean skipRL(int rl); + + private void readPage() { + LOG.debug("loading page"); + DataPage page = pageReader.readPage(); + page.accept(new DataPage.Visitor() { + @Override + public Void visit(DataPageV1 dataPageV1) { + readPageV1(dataPageV1); + return null; + } + @Override + public Void visit(DataPageV2 dataPageV2) { + readPageV2(dataPageV2); + return null; + } + }); + } + + private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) { + ValuesReader previousReader = this.dataColumn; + + this.currentEncoding = dataEncoding; + this.pageValueCount = valueCount; + this.endOfPageValueCount = readValues + pageValueCount; + + if (dataEncoding.usesDictionary()) { + if (dictionary == null) { + throw new ParquetDecodingException( + "could not read page in col " + path + " as the dictionary was missing for encoding " + dataEncoding); + } + this.dataColumn = dataEncoding.getDictionaryBasedValuesReader(path, VALUES, dictionary); + } else { + this.dataColumn = dataEncoding.getValuesReader(path, VALUES); + } + + if (dataEncoding.usesDictionary() && converter.hasDictionarySupport()) { + bindToDictionary(dictionary); + } else { + bind(path.getType()); + } + + try { + dataColumn.initFromPage(pageValueCount, in); + } catch (IOException e) { + throw new ParquetDecodingException("could not read page in col " + path, e); + } + + if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding) && + previousReader != null && previousReader instanceof RequiresPreviousReader) { + // previous reader can only be set if reading sequentially + ((RequiresPreviousReader) dataColumn).setPreviousReader(previousReader); + } + } + + private void readPageV1(DataPageV1 page) { + ValuesReader rlReader = page.getRlEncoding().getValuesReader(path, REPETITION_LEVEL); + ValuesReader dlReader = page.getDlEncoding().getValuesReader(path, DEFINITION_LEVEL); + this.repetitionLevelColumn = new ValuesReaderIntIterator(rlReader); + 
this.definitionLevelColumn = new ValuesReaderIntIterator(dlReader); + int valueCount = page.getValueCount(); + try { + BytesInput bytes = page.getBytes(); + LOG.debug("page size {} bytes and {} values", bytes.size(), valueCount); + LOG.debug("reading repetition levels at 0"); + ByteBufferInputStream in = bytes.toInputStream(); + rlReader.initFromPage(valueCount, in); + LOG.debug("reading definition levels at {}", in.position()); + dlReader.initFromPage(valueCount, in); + LOG.debug("reading data at {}", in.position()); + initDataReader(page.getValueEncoding(), in, valueCount); + } catch (IOException e) { + throw new ParquetDecodingException("could not read page " + page + " in col " + path, e); + } + newPageInitialized(page); + } + + private void readPageV2(DataPageV2 page) { + this.repetitionLevelColumn = newRLEIterator(path.getMaxRepetitionLevel(), page.getRepetitionLevels()); + this.definitionLevelColumn = newRLEIterator(path.getMaxDefinitionLevel(), page.getDefinitionLevels()); + int valueCount = page.getValueCount(); + LOG.debug("page data size {} bytes and {} values", page.getData().size(), valueCount); + try { + initDataReader(page.getDataEncoding(), page.getData().toInputStream(), valueCount); + } catch (IOException e) { + throw new ParquetDecodingException("could not read page " + page + " in col " + path, e); + } + newPageInitialized(page); + } + + final int getPageValueCount() { + return pageValueCount; + } + + abstract void newPageInitialized(DataPage page); + + private IntIterator newRLEIterator(int maxLevel, BytesInput bytes) { + try { + if (maxLevel == 0) { + return new NullIntIterator(); + } + return new RLEIntIterator( + new RunLengthBitPackingHybridDecoder( + BytesUtils.getWidthFromMaxInt(maxLevel), + bytes.toInputStream())); + } catch (IOException e) { + throw new ParquetDecodingException("could not read levels in page for col " + path, e); + } + } + + boolean isPageFullyConsumed() { + return readValues >= endOfPageValueCount; + } + + /** + * 
{@inheritDoc} + * @see org.apache.parquet.column.ColumnReader#consume() + */ + @Override + public void consume() { + checkRead(); + valueRead = false; + } + + /** + * {@inheritDoc} + * @see org.apache.parquet.column.ColumnReader#getTotalValueCount() + */ + @Deprecated + @Override + public long getTotalValueCount() { + return totalValueCount; + } + + static abstract class IntIterator { + abstract int nextInt(); + } + + static class ValuesReaderIntIterator extends IntIterator { + ValuesReader delegate; + + public ValuesReaderIntIterator(ValuesReader delegate) { + super(); + this.delegate = delegate; + } + + @Override + int nextInt() { + return delegate.readInteger(); + } + } + + static class RLEIntIterator extends IntIterator { + RunLengthBitPackingHybridDecoder delegate; + + public RLEIntIterator(RunLengthBitPackingHybridDecoder delegate) { + this.delegate = delegate; + } + + @Override + int nextInt() { + try { + return delegate.readInt(); + } catch (IOException e) { + throw new ParquetDecodingException(e); + } + } + } + + private static final class NullIntIterator extends IntIterator { + @Override + int nextInt() { + return 0; + } + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReaderImpl.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReaderImpl.java index 8c85b37f8e..0413d621c1 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReaderImpl.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReaderImpl.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -18,675 +18,41 @@ */ package org.apache.parquet.column.impl; -import static java.lang.String.format; -import static org.apache.parquet.Preconditions.checkNotNull; -import static org.apache.parquet.column.ValuesType.DEFINITION_LEVEL; -import static org.apache.parquet.column.ValuesType.REPETITION_LEVEL; -import static org.apache.parquet.column.ValuesType.VALUES; - -import java.io.IOException; - -import org.apache.parquet.CorruptDeltaByteArrays; import org.apache.parquet.VersionParser.ParsedVersion; -import org.apache.parquet.bytes.ByteBufferInputStream; -import org.apache.parquet.bytes.BytesInput; -import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.column.ColumnReader; -import org.apache.parquet.column.Dictionary; -import org.apache.parquet.column.Encoding; import org.apache.parquet.column.page.DataPage; -import org.apache.parquet.column.page.DataPageV1; -import org.apache.parquet.column.page.DataPageV2; -import org.apache.parquet.column.page.DictionaryPage; import org.apache.parquet.column.page.PageReader; -import org.apache.parquet.column.values.RequiresPreviousReader; -import org.apache.parquet.column.values.ValuesReader; -import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder; -import org.apache.parquet.io.ParquetDecodingException; -import org.apache.parquet.io.api.Binary; import org.apache.parquet.io.api.PrimitiveConverter; -import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; -import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeNameConverter; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** - * ColumnReader implementation + * ColumnReader implementation for 
the scenario when column indexes are not used (all values are read) */ -public class ColumnReaderImpl implements ColumnReader { - private static final Logger LOG = LoggerFactory.getLogger(ColumnReaderImpl.class); - - /** - * binds the lower level page decoder to the record converter materializing the records - */ - private static abstract class Binding { - - /** - * read one value from the underlying page - */ - abstract void read(); - - /** - * skip one value from the underlying page - */ - abstract void skip(); - - /** - * write current value to converter - */ - abstract void writeValue(); - - /** - * @return current value - */ - public int getDictionaryId() { - throw new UnsupportedOperationException(); - } - - /** - * @return current value - */ - public int getInteger() { - throw new UnsupportedOperationException(); - } - - /** - * @return current value - */ - public boolean getBoolean() { - throw new UnsupportedOperationException(); - } - - /** - * @return current value - */ - public long getLong() { - throw new UnsupportedOperationException(); - } - - /** - * @return current value - */ - public Binary getBinary() { - throw new UnsupportedOperationException(); - } - - /** - * @return current value - */ - public float getFloat() { - throw new UnsupportedOperationException(); - } - - /** - * @return current value - */ - public double getDouble() { - throw new UnsupportedOperationException(); - } - } - - private final ParsedVersion writerVersion; - private final ColumnDescriptor path; - private final long totalValueCount; - private final PageReader pageReader; - private final Dictionary dictionary; - - private IntIterator repetitionLevelColumn; - private IntIterator definitionLevelColumn; - protected ValuesReader dataColumn; - private Encoding currentEncoding; - - private int repetitionLevel; - private int definitionLevel; - private int dictionaryId; - - private long endOfPageValueCount; - private long readValues = 0; - private int pageValueCount = 0; - - private 
final PrimitiveConverter converter; - private Binding binding; - - // this is needed because we will attempt to read the value twice when filtering - // TODO: rework that - private boolean valueRead; - - private void bindToDictionary(final Dictionary dictionary) { - binding = - new Binding() { - void read() { - dictionaryId = dataColumn.readValueDictionaryId(); - } - public void skip() { - dataColumn.skip(); - } - public int getDictionaryId() { - return dictionaryId; - } - void writeValue() { - converter.addValueFromDictionary(dictionaryId); - } - public int getInteger() { - return dictionary.decodeToInt(dictionaryId); - } - public boolean getBoolean() { - return dictionary.decodeToBoolean(dictionaryId); - } - public long getLong() { - return dictionary.decodeToLong(dictionaryId); - } - public Binary getBinary() { - return dictionary.decodeToBinary(dictionaryId); - } - public float getFloat() { - return dictionary.decodeToFloat(dictionaryId); - } - public double getDouble() { - return dictionary.decodeToDouble(dictionaryId); - } - }; - } - - private void bind(PrimitiveTypeName type) { - binding = type.convert(new PrimitiveTypeNameConverter() { - @Override - public Binding convertFLOAT(PrimitiveTypeName primitiveTypeName) throws RuntimeException { - return new Binding() { - float current; - void read() { - current = dataColumn.readFloat(); - } - public void skip() { - current = 0; - dataColumn.skip(); - } - public float getFloat() { - return current; - } - void writeValue() { - converter.addFloat(current); - } - }; - } - @Override - public Binding convertDOUBLE(PrimitiveTypeName primitiveTypeName) throws RuntimeException { - return new Binding() { - double current; - void read() { - current = dataColumn.readDouble(); - } - public void skip() { - current = 0; - dataColumn.skip(); - } - public double getDouble() { - return current; - } - void writeValue() { - converter.addDouble(current); - } - }; - } - @Override - public Binding convertINT32(PrimitiveTypeName 
primitiveTypeName) throws RuntimeException { - return new Binding() { - int current; - void read() { - current = dataColumn.readInteger(); - } - public void skip() { - current = 0; - dataColumn.skip(); - } - @Override - public int getInteger() { - return current; - } - void writeValue() { - converter.addInt(current); - } - }; - } - @Override - public Binding convertINT64(PrimitiveTypeName primitiveTypeName) throws RuntimeException { - return new Binding() { - long current; - void read() { - current = dataColumn.readLong(); - } - public void skip() { - current = 0; - dataColumn.skip(); - } - @Override - public long getLong() { - return current; - } - void writeValue() { - converter.addLong(current); - } - }; - } - @Override - public Binding convertINT96(PrimitiveTypeName primitiveTypeName) throws RuntimeException { - return this.convertBINARY(primitiveTypeName); - } - @Override - public Binding convertFIXED_LEN_BYTE_ARRAY( - PrimitiveTypeName primitiveTypeName) throws RuntimeException { - return this.convertBINARY(primitiveTypeName); - } - @Override - public Binding convertBOOLEAN(PrimitiveTypeName primitiveTypeName) throws RuntimeException { - return new Binding() { - boolean current; - void read() { - current = dataColumn.readBoolean(); - } - public void skip() { - current = false; - dataColumn.skip(); - } - @Override - public boolean getBoolean() { - return current; - } - void writeValue() { - converter.addBoolean(current); - } - }; - } - @Override - public Binding convertBINARY(PrimitiveTypeName primitiveTypeName) throws RuntimeException { - return new Binding() { - Binary current; - void read() { - current = dataColumn.readBytes(); - } - public void skip() { - current = null; - dataColumn.skip(); - } - @Override - public Binary getBinary() { - return current; - } - void writeValue() { - converter.addBinary(current); - } - }; - } - }); - } +public class ColumnReaderImpl extends ColumnReaderBase { /** * creates a reader for triplets - * @param path the descriptor 
for the corresponding column - * @param pageReader the underlying store to read from - * @param converter a converter that materializes the values in this column in the current record - * @param writerVersion writer version string from the Parquet file being read - */ - public ColumnReaderImpl(ColumnDescriptor path, PageReader pageReader, PrimitiveConverter converter, ParsedVersion writerVersion) { - this.path = checkNotNull(path, "path"); - this.pageReader = checkNotNull(pageReader, "pageReader"); - this.converter = checkNotNull(converter, "converter"); - this.writerVersion = writerVersion; - DictionaryPage dictionaryPage = pageReader.readDictionaryPage(); - if (dictionaryPage != null) { - try { - this.dictionary = dictionaryPage.getEncoding().initDictionary(path, dictionaryPage); - if (converter.hasDictionarySupport()) { - converter.setDictionary(dictionary); - } - } catch (IOException e) { - throw new ParquetDecodingException("could not decode the dictionary for " + path, e); - } - } else { - this.dictionary = null; - } - this.totalValueCount = pageReader.getTotalValueCount(); - if (totalValueCount <= 0) { - throw new ParquetDecodingException("totalValueCount '" + totalValueCount + "' <= 0"); - } + * + * @param path + * the descriptor for the corresponding column + * @param pageReader + * the underlying store to read from + * @param converter + * a converter that materializes the values in this column in the current record + * @param writerVersion + * writer version string from the Parquet file being read + */ + public ColumnReaderImpl(ColumnDescriptor path, PageReader pageReader, PrimitiveConverter converter, + ParsedVersion writerVersion) { + super(path, pageReader, converter, writerVersion); consume(); } - private boolean isFullyConsumed() { - return readValues >= totalValueCount; - } - - /** - * {@inheritDoc} - * @see org.apache.parquet.column.ColumnReader#writeCurrentValueToConverter() - */ - @Override - public void writeCurrentValueToConverter() { - 
readValue(); - this.binding.writeValue(); - } - - @Override - public int getCurrentValueDictionaryID() { - readValue(); - return binding.getDictionaryId(); - } - - /** - * {@inheritDoc} - * @see org.apache.parquet.column.ColumnReader#getInteger() - */ - @Override - public int getInteger() { - readValue(); - return this.binding.getInteger(); - } - - /** - * {@inheritDoc} - * @see org.apache.parquet.column.ColumnReader#getBoolean() - */ - @Override - public boolean getBoolean() { - readValue(); - return this.binding.getBoolean(); - } - - /** - * {@inheritDoc} - * @see org.apache.parquet.column.ColumnReader#getLong() - */ - @Override - public long getLong() { - readValue(); - return this.binding.getLong(); - } - - /** - * {@inheritDoc} - * @see org.apache.parquet.column.ColumnReader#getBinary() - */ - @Override - public Binary getBinary() { - readValue(); - return this.binding.getBinary(); - } - - /** - * {@inheritDoc} - * @see org.apache.parquet.column.ColumnReader#getFloat() - */ - @Override - public float getFloat() { - readValue(); - return this.binding.getFloat(); - } - - /** - * {@inheritDoc} - * @see org.apache.parquet.column.ColumnReader#getDouble() - */ - @Override - public double getDouble() { - readValue(); - return this.binding.getDouble(); - } - - /** - * {@inheritDoc} - * @see org.apache.parquet.column.ColumnReader#getCurrentRepetitionLevel() - */ - @Override - public int getCurrentRepetitionLevel() { - return repetitionLevel; - } - - /** - * {@inheritDoc} - * @see org.apache.parquet.column.ColumnReader#getDescriptor() - */ - @Override - public ColumnDescriptor getDescriptor() { - return path; - } - - /** - * Reads the value into the binding. 
- */ - public void readValue() { - try { - if (!valueRead) { - binding.read(); - valueRead = true; - } - } catch (RuntimeException e) { - if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, currentEncoding) && - e instanceof ArrayIndexOutOfBoundsException) { - // this is probably PARQUET-246, which may happen if reading data with - // MR because this can't be detected without reading all footers - throw new ParquetDecodingException("Read failure possibly due to " + - "PARQUET-246: try setting parquet.split.files to false", - new ParquetDecodingException( - format("Can't read value in column %s at value %d out of %d, " + - "%d out of %d in currentPage. repetition level: " + - "%d, definition level: %d", - path, readValues, totalValueCount, - readValues - (endOfPageValueCount - pageValueCount), - pageValueCount, repetitionLevel, definitionLevel), - e)); - } - throw new ParquetDecodingException( - format("Can't read value in column %s at value %d out of %d, " + - "%d out of %d in currentPage. 
repetition level: " + - "%d, definition level: %d", - path, readValues, totalValueCount, - readValues - (endOfPageValueCount - pageValueCount), - pageValueCount, repetitionLevel, definitionLevel), - e); - } - } - - /** - * {@inheritDoc} - * @see org.apache.parquet.column.ColumnReader#skip() - */ - @Override - public void skip() { - if (!valueRead) { - binding.skip(); - valueRead = true; - } - } - - /** - * {@inheritDoc} - * @see org.apache.parquet.column.ColumnReader#getCurrentDefinitionLevel() - */ - @Override - public int getCurrentDefinitionLevel() { - return definitionLevel; - } - - // TODO: change the logic around read() to not tie together reading from the 3 columns - private void readRepetitionAndDefinitionLevels() { - repetitionLevel = repetitionLevelColumn.nextInt(); - definitionLevel = definitionLevelColumn.nextInt(); - ++readValues; - } - - private void checkRead() { - if (isPageFullyConsumed()) { - if (isFullyConsumed()) { - LOG.debug("end reached"); - repetitionLevel = 0; // the next repetition level - return; - } - readPage(); - } - readRepetitionAndDefinitionLevels(); - } - - private void readPage() { - LOG.debug("loading page"); - DataPage page = pageReader.readPage(); - page.accept(new DataPage.Visitor() { - @Override - public Void visit(DataPageV1 dataPageV1) { - readPageV1(dataPageV1); - return null; - } - @Override - public Void visit(DataPageV2 dataPageV2) { - readPageV2(dataPageV2); - return null; - } - }); - } - - private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) { - ValuesReader previousReader = this.dataColumn; - - this.currentEncoding = dataEncoding; - this.pageValueCount = valueCount; - this.endOfPageValueCount = readValues + pageValueCount; - - if (dataEncoding.usesDictionary()) { - if (dictionary == null) { - throw new ParquetDecodingException( - "could not read page in col " + path + " as the dictionary was missing for encoding " + dataEncoding); - } - this.dataColumn = 
dataEncoding.getDictionaryBasedValuesReader(path, VALUES, dictionary); - } else { - this.dataColumn = dataEncoding.getValuesReader(path, VALUES); - } - - if (dataEncoding.usesDictionary() && converter.hasDictionarySupport()) { - bindToDictionary(dictionary); - } else { - bind(path.getType()); - } - - try { - dataColumn.initFromPage(pageValueCount, in); - } catch (IOException e) { - throw new ParquetDecodingException("could not read page in col " + path, e); - } - - if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding) && - previousReader != null && previousReader instanceof RequiresPreviousReader) { - // previous reader can only be set if reading sequentially - ((RequiresPreviousReader) dataColumn).setPreviousReader(previousReader); - } - } - - private void readPageV1(DataPageV1 page) { - ValuesReader rlReader = page.getRlEncoding().getValuesReader(path, REPETITION_LEVEL); - ValuesReader dlReader = page.getDlEncoding().getValuesReader(path, DEFINITION_LEVEL); - this.repetitionLevelColumn = new ValuesReaderIntIterator(rlReader); - this.definitionLevelColumn = new ValuesReaderIntIterator(dlReader); - try { - BytesInput bytes = page.getBytes(); - LOG.debug("page size {} bytes and {} records", bytes.size(), pageValueCount); - LOG.debug("reading repetition levels at 0"); - ByteBufferInputStream in = bytes.toInputStream(); - rlReader.initFromPage(pageValueCount, in); - LOG.debug("reading definition levels at {}", in.position()); - dlReader.initFromPage(pageValueCount, in); - LOG.debug("reading data at {}", in.position()); - initDataReader(page.getValueEncoding(), in, page.getValueCount()); - } catch (IOException e) { - throw new ParquetDecodingException("could not read page " + page + " in col " + path, e); - } - } - - private void readPageV2(DataPageV2 page) { - this.repetitionLevelColumn = newRLEIterator(path.getMaxRepetitionLevel(), page.getRepetitionLevels()); - this.definitionLevelColumn = newRLEIterator(path.getMaxDefinitionLevel(), 
page.getDefinitionLevels()); - LOG.debug("page data size {} bytes and {} records", page.getData().size(), pageValueCount); - try { - initDataReader(page.getDataEncoding(), page.getData().toInputStream(), page.getValueCount()); - } catch (IOException e) { - throw new ParquetDecodingException("could not read page " + page + " in col " + path, e); - } - } - - private IntIterator newRLEIterator(int maxLevel, BytesInput bytes) { - try { - if (maxLevel == 0) { - return new NullIntIterator(); - } - return new RLEIntIterator( - new RunLengthBitPackingHybridDecoder( - BytesUtils.getWidthFromMaxInt(maxLevel), - bytes.toInputStream())); - } catch (IOException e) { - throw new ParquetDecodingException("could not read levels in page for col " + path, e); - } - } - - private boolean isPageFullyConsumed() { - return readValues >= endOfPageValueCount; - } - - /** - * {@inheritDoc} - * @see org.apache.parquet.column.ColumnReader#consume() - */ @Override - public void consume() { - checkRead(); - valueRead = false; + boolean skipRL(int rl) { + return false; } - /** - * {@inheritDoc} - * @see org.apache.parquet.column.ColumnReader#getTotalValueCount() - */ @Override - public long getTotalValueCount() { - return totalValueCount; - } - - static abstract class IntIterator { - abstract int nextInt(); - } - - static class ValuesReaderIntIterator extends IntIterator { - ValuesReader delegate; - - public ValuesReaderIntIterator(ValuesReader delegate) { - super(); - this.delegate = delegate; - } - - @Override - int nextInt() { - return delegate.readInteger(); - } - } - - static class RLEIntIterator extends IntIterator { - RunLengthBitPackingHybridDecoder delegate; - - public RLEIntIterator(RunLengthBitPackingHybridDecoder delegate) { - this.delegate = delegate; - } - - @Override - int nextInt() { - try { - return delegate.readInt(); - } catch (IOException e) { - throw new ParquetDecodingException(e); - } - } - } - - private static final class NullIntIterator extends IntIterator { - @Override 
- int nextInt() { - return 0; - } + void newPageInitialized(DataPage page) { } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreBase.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreBase.java new file mode 100644 index 0000000000..dc4946e4ff --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreBase.java @@ -0,0 +1,255 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.column.impl; + +import static java.lang.Math.max; +import static java.lang.Math.min; +import static java.util.Collections.unmodifiableMap; + +import java.util.Arrays; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.TreeMap; + +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.ColumnWriteStore; +import org.apache.parquet.column.ColumnWriter; +import org.apache.parquet.column.ParquetProperties; +import org.apache.parquet.column.page.PageWriteStore; +import org.apache.parquet.column.page.PageWriter; +import org.apache.parquet.column.values.bloomfilter.BloomFilterWriteStore; +import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter; +import org.apache.parquet.schema.MessageType; + +/** + * Base implementation for {@link ColumnWriteStore} to be extended to specialize for V1 and V2 pages. + */ +abstract class ColumnWriteStoreBase implements ColumnWriteStore { + + // Used to support the deprecated workflow of ColumnWriteStoreV1 (lazy init of ColumnWriters) + private interface ColumnWriterProvider { + ColumnWriter getColumnWriter(ColumnDescriptor path); + } + + private final ColumnWriterProvider columnWriterProvider; + + // will flush even if size bellow the threshold by this much to facilitate page alignment + private static final float THRESHOLD_TOLERANCE_RATIO = 0.1f; // 10 % + + private final Map columns; + private final ParquetProperties props; + private final long thresholdTolerance; + private long rowCount; + private long rowCountForNextSizeCheck; + + // To be used by the deprecated constructor of ColumnWriteStoreV1 + @Deprecated + ColumnWriteStoreBase( + final PageWriteStore pageWriteStore, + final ParquetProperties props) { + this.props = props; + this.thresholdTolerance = (long) (props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO); + + this.columns = new TreeMap<>(); + + this.rowCountForNextSizeCheck = 
props.getMinRowCountForPageSizeCheck(); + + columnWriterProvider = new ColumnWriterProvider() { + @Override + public ColumnWriter getColumnWriter(ColumnDescriptor path) { + ColumnWriterBase column = columns.get(path); + if (column == null) { + column = createColumnWriter(path, pageWriteStore.getPageWriter(path), null, props); + columns.put(path, column); + } + return column; + } + }; + } + + ColumnWriteStoreBase( + MessageType schema, + PageWriteStore pageWriteStore, + ParquetProperties props) { + this.props = props; + this.thresholdTolerance = (long) (props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO); + Map mcolumns = new TreeMap<>(); + for (ColumnDescriptor path : schema.getColumns()) { + PageWriter pageWriter = pageWriteStore.getPageWriter(path); + mcolumns.put(path, createColumnWriter(path, pageWriter, null, props)); + } + this.columns = unmodifiableMap(mcolumns); + + this.rowCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck(); + + columnWriterProvider = new ColumnWriterProvider() { + @Override + public ColumnWriter getColumnWriter(ColumnDescriptor path) { + return columns.get(path); + } + }; + } + + ColumnWriteStoreBase( + MessageType schema, + PageWriteStore pageWriteStore, + BloomFilterWriteStore bloomFilterWriteStore, + ParquetProperties props) { + this.props = props; + this.thresholdTolerance = (long) (props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO); + Map mcolumns = new TreeMap<>(); + for (ColumnDescriptor path : schema.getColumns()) { + PageWriter pageWriter = pageWriteStore.getPageWriter(path); + if (props.isBloomFilterEnabled() && props.getBloomFilterExpectedDistinctNumbers() != null) { + BloomFilterWriter bloomFilterWriter = bloomFilterWriteStore.getBloomFilterWriter(path); + mcolumns.put(path, createColumnWriter(path, pageWriter, bloomFilterWriter, props)); + } else { + mcolumns.put(path, createColumnWriter(path, pageWriter, null, props)); + } + } + this.columns = unmodifiableMap(mcolumns); + + 
this.rowCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck(); + + columnWriterProvider = new ColumnWriterProvider() { + @Override + public ColumnWriter getColumnWriter(ColumnDescriptor path) { + return columns.get(path); + } + }; + } + + abstract ColumnWriterBase createColumnWriter(ColumnDescriptor path, PageWriter pageWriter, + BloomFilterWriter bloomFilterWriter, ParquetProperties props); + + public ColumnWriter getColumnWriter(ColumnDescriptor path) { + return columnWriterProvider.getColumnWriter(path); + } + + public Set getColumnDescriptors() { + return columns.keySet(); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + for (Entry entry : columns.entrySet()) { + sb.append(Arrays.toString(entry.getKey().getPath())).append(": "); + sb.append(entry.getValue().getTotalBufferedSize()).append(" bytes"); + sb.append("\n"); + } + return sb.toString(); + } + + @Override + public long getAllocatedSize() { + long total = 0; + for (ColumnWriterBase memColumn : columns.values()) { + total += memColumn.allocatedSize(); + } + return total; + } + + @Override + public long getBufferedSize() { + long total = 0; + for (ColumnWriterBase memColumn : columns.values()) { + total += memColumn.getTotalBufferedSize(); + } + return total; + } + + @Override + public void flush() { + for (ColumnWriterBase memColumn : columns.values()) { + long rows = rowCount - memColumn.getRowsWrittenSoFar(); + if (rows > 0) { + memColumn.writePage(); + } + memColumn.finalizeColumnChunk(); + } + } + + public String memUsageString() { + StringBuilder b = new StringBuilder("Store {\n"); + for (ColumnWriterBase memColumn : columns.values()) { + b.append(memColumn.memUsageString(" ")); + } + b.append("}\n"); + return b.toString(); + } + + public long maxColMemSize() { + long max = 0; + for (ColumnWriterBase memColumn : columns.values()) { + max = Math.max(max, memColumn.getBufferedSizeInMemory()); + } + return max; + } + + @Override + public void close() { + 
flush(); // calling flush() here to keep it consistent with the behavior before merging with master + for (ColumnWriterBase memColumn : columns.values()) { + memColumn.close(); + } + } + + @Override + public void endRecord() { + ++rowCount; + if (rowCount >= rowCountForNextSizeCheck) { + sizeCheck(); + } + } + + private void sizeCheck() { + long minRecordToWait = Long.MAX_VALUE; + for (ColumnWriterBase writer : columns.values()) { + long usedMem = writer.getCurrentPageBufferedSize(); + long rows = rowCount - writer.getRowsWrittenSoFar(); + long remainingMem = props.getPageSizeThreshold() - usedMem; + if (remainingMem <= thresholdTolerance) { + writer.writePage(); + remainingMem = props.getPageSizeThreshold(); + } + long rowsToFillPage = + usedMem == 0 ? + props.getMaxRowCountForPageSizeCheck() + : (long) ((float) rows) / usedMem * remainingMem; + if (rowsToFillPage < minRecordToWait) { + minRecordToWait = rowsToFillPage; + } + } + if (minRecordToWait == Long.MAX_VALUE) { + minRecordToWait = props.getMinRowCountForPageSizeCheck(); + } + + if (props.estimateNextSizeCheck()) { + // will check again halfway if between min and max + rowCountForNextSizeCheck = rowCount + + min( + max(minRecordToWait / 2, props.getMinRowCountForPageSizeCheck()), + props.getMaxRowCountForPageSizeCheck()); + } else { + rowCountForNextSizeCheck = rowCount + props.getMinRowCountForPageSizeCheck(); + } + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV1.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV1.java index 7e2876077a..dd13b0b8a4 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV1.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV1.java @@ -18,135 +18,34 @@ */ package org.apache.parquet.column.impl; -import java.util.Arrays; -import java.util.Collection; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; 
-import java.util.TreeMap; - import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.column.ColumnWriteStore; -import org.apache.parquet.column.ColumnWriter; import org.apache.parquet.column.ParquetProperties; import org.apache.parquet.column.page.PageWriteStore; import org.apache.parquet.column.page.PageWriter; import org.apache.parquet.column.values.bloomfilter.BloomFilterWriteStore; import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter; +import org.apache.parquet.schema.MessageType; -public class ColumnWriteStoreV1 implements ColumnWriteStore { - - private final Map columns = new TreeMap(); - private final PageWriteStore pageWriteStore; - private final ParquetProperties props; - private BloomFilterWriteStore bloomFilterWriteStore; +public class ColumnWriteStoreV1 extends ColumnWriteStoreBase { + public ColumnWriteStoreV1(MessageType schema, PageWriteStore pageWriteStore, ParquetProperties props) { + super(schema, pageWriteStore, props); + } - public ColumnWriteStoreV1(PageWriteStore pageWriteStore, - ParquetProperties props) { - this.pageWriteStore = pageWriteStore; - this.props = props; + @Deprecated + public ColumnWriteStoreV1(final PageWriteStore pageWriteStore, + final ParquetProperties props) { + super(pageWriteStore, props); } - public ColumnWriteStoreV1(PageWriteStore pageWriteStore, + public ColumnWriteStoreV1(MessageType schema, PageWriteStore pageWriteStore, BloomFilterWriteStore bloomFilterWriteStore, ParquetProperties props) { - this (pageWriteStore, props); - this.bloomFilterWriteStore = bloomFilterWriteStore; - } - - public ColumnWriter getColumnWriter(ColumnDescriptor path) { - ColumnWriterV1 column = columns.get(path); - if (column == null) { - column = newMemColumn(path); - columns.put(path, column); - } - return column; - } - - public Set getColumnDescriptors() { - return columns.keySet(); - } - - private ColumnWriterV1 newMemColumn(ColumnDescriptor path) { - PageWriter pageWriter = 
pageWriteStore.getPageWriter(path); - - if (props.isBloomFilterEnabled() && props.getBloomFilterExpectValues() != null) { - BloomFilterWriter bloomFilterWriter = bloomFilterWriteStore.getBloomFilterWriter(path); - return new ColumnWriterV1(path, pageWriter, bloomFilterWriter, props); - } else { - return new ColumnWriterV1(path, pageWriter, props); - } - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - for (Entry entry : columns.entrySet()) { - sb.append(Arrays.toString(entry.getKey().getPath())).append(": "); - sb.append(entry.getValue().getBufferedSizeInMemory()).append(" bytes"); - sb.append("\n"); - } - return sb.toString(); + super (schema, pageWriteStore, bloomFilterWriteStore, props); } @Override - public long getAllocatedSize() { - Collection values = columns.values(); - long total = 0; - for (ColumnWriterV1 memColumn : values) { - total += memColumn.allocatedSize(); - } - return total; + ColumnWriterBase createColumnWriter(ColumnDescriptor path, PageWriter pageWriter, + BloomFilterWriter bloomFilterWriter, ParquetProperties props) { + return new ColumnWriterV1(path, pageWriter, bloomFilterWriter, props); } - - @Override - public long getBufferedSize() { - Collection values = columns.values(); - long total = 0; - for (ColumnWriterV1 memColumn : values) { - total += memColumn.getBufferedSizeInMemory(); - } - return total; - } - - @Override - public String memUsageString() { - StringBuilder b = new StringBuilder("Store {\n"); - Collection values = columns.values(); - for (ColumnWriterV1 memColumn : values) { - b.append(memColumn.memUsageString(" ")); - } - b.append("}\n"); - return b.toString(); - } - - public long maxColMemSize() { - Collection values = columns.values(); - long max = 0; - for (ColumnWriterV1 memColumn : values) { - max = Math.max(max, memColumn.getBufferedSizeInMemory()); - } - return max; - } - - @Override - public void flush() { - Collection values = columns.values(); - for (ColumnWriterV1 memColumn : 
values) { - memColumn.flush(); - } - } - - @Override - public void endRecord() { - // V1 does not take record boundaries into account - } - - public void close() { - Collection values = columns.values(); - for (ColumnWriterV1 memColumn : values) { - memColumn.close(); - } - } - } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV2.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV2.java index 6c20b8bb87..a9f2d5848d 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV2.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV2.java @@ -18,20 +18,7 @@ */ package org.apache.parquet.column.impl; -import static java.lang.Math.max; -import static java.lang.Math.min; -import static java.util.Collections.unmodifiableMap; - -import java.util.Arrays; -import java.util.Collection; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; -import java.util.TreeMap; - import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.column.ColumnWriteStore; -import org.apache.parquet.column.ColumnWriter; import org.apache.parquet.column.ParquetProperties; import org.apache.parquet.column.page.PageWriteStore; import org.apache.parquet.column.page.PageWriter; @@ -39,163 +26,20 @@ import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter; import org.apache.parquet.schema.MessageType; -public class ColumnWriteStoreV2 implements ColumnWriteStore { - - // will flush even if size bellow the threshold by this much to facilitate page alignment - private static final float THRESHOLD_TOLERANCE_RATIO = 0.1f; // 10 % - - private final Map columns; - private final Collection writers; - private final ParquetProperties props; - private final long thresholdTolerance; - private long rowCount; - private long rowCountForNextSizeCheck; - - public ColumnWriteStoreV2( - MessageType schema, - PageWriteStore 
pageWriteStore, - ParquetProperties props) { - this.props = props; - this.thresholdTolerance = (long)(props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO); - Map mcolumns = new TreeMap(); - for (ColumnDescriptor path : schema.getColumns()) { - PageWriter pageWriter = pageWriteStore.getPageWriter(path); - mcolumns.put(path, new ColumnWriterV2(path, pageWriter, props)); - } - this.columns = unmodifiableMap(mcolumns); - this.writers = this.columns.values(); - - this.rowCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck(); +public class ColumnWriteStoreV2 extends ColumnWriteStoreBase { + public ColumnWriteStoreV2(MessageType schema, PageWriteStore pageWriteStore, ParquetProperties props) { + super(schema, pageWriteStore, props); } - public ColumnWriteStoreV2( - MessageType schema, - PageWriteStore pageWriteStore, - BloomFilterWriteStore bloomFilterWriteStore, - ParquetProperties props) { - this.props = props; - this.thresholdTolerance = (long)(props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO); - Map mcolumns = new TreeMap(); - - for (ColumnDescriptor path : schema.getColumns()) { - PageWriter pageWriter = pageWriteStore.getPageWriter(path); - if (props.isBloomFilterEnabled() && props.getBloomFilterExpectValues() != null) { - BloomFilterWriter bloomFilterWriter = bloomFilterWriteStore.getBloomFilterWriter(path); - mcolumns.put(path, new ColumnWriterV2(path, pageWriter, bloomFilterWriter, props)); - } else { - mcolumns.put(path, new ColumnWriterV2(path, pageWriter, props)); - } - } - this.columns = unmodifiableMap(mcolumns); - this.writers = this.columns.values(); - - this.rowCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck(); - } - - public ColumnWriter getColumnWriter(ColumnDescriptor path) { - return columns.get(path); - } - - public Set getColumnDescriptors() { - return columns.keySet(); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - for (Entry entry : columns.entrySet()) { - 
sb.append(Arrays.toString(entry.getKey().getPath())).append(": "); - sb.append(entry.getValue().getTotalBufferedSize()).append(" bytes"); - sb.append("\n"); - } - return sb.toString(); + public ColumnWriteStoreV2(MessageType schema, PageWriteStore pageWriteStore, + BloomFilterWriteStore bloomFilterWriteStore, + ParquetProperties props) { + super(schema, pageWriteStore, bloomFilterWriteStore, props); } @Override - public long getAllocatedSize() { - long total = 0; - for (ColumnWriterV2 memColumn : columns.values()) { - total += memColumn.allocatedSize(); - } - return total; + ColumnWriterBase createColumnWriter(ColumnDescriptor path, PageWriter pageWriter, + BloomFilterWriter bloomFilterWriter, ParquetProperties props) { + return new ColumnWriterV2(path, pageWriter, bloomFilterWriter, props); } - - @Override - public long getBufferedSize() { - long total = 0; - for (ColumnWriterV2 memColumn : columns.values()) { - total += memColumn.getTotalBufferedSize(); - } - return total; - } - - @Override - public void flush() { - for (ColumnWriterV2 memColumn : columns.values()) { - long rows = rowCount - memColumn.getRowsWrittenSoFar(); - if (rows > 0) { - memColumn.writePage(rowCount); - } - memColumn.finalizeColumnChunk(); - } - } - - public String memUsageString() { - StringBuilder b = new StringBuilder("Store {\n"); - for (ColumnWriterV2 memColumn : columns.values()) { - b.append(memColumn.memUsageString(" ")); - } - b.append("}\n"); - return b.toString(); - } - - @Override - public void close() { - flush(); // calling flush() here to keep it consistent with the behavior before merging with master - for (ColumnWriterV2 memColumn : columns.values()) { - memColumn.close(); - } - } - - @Override - public void endRecord() { - ++ rowCount; - if (rowCount >= rowCountForNextSizeCheck) { - sizeCheck(); - } - } - - private void sizeCheck() { - long minRecordToWait = Long.MAX_VALUE; - for (ColumnWriterV2 writer : writers) { - long usedMem = writer.getCurrentPageBufferedSize(); - 
long rows = rowCount - writer.getRowsWrittenSoFar(); - long remainingMem = props.getPageSizeThreshold() - usedMem; - if (remainingMem <= thresholdTolerance) { - writer.writePage(rowCount); - remainingMem = props.getPageSizeThreshold(); - } - long rowsToFillPage = - usedMem == 0 ? - props.getMaxRowCountForPageSizeCheck() - : (long)((float)rows) / usedMem * remainingMem; - if (rowsToFillPage < minRecordToWait) { - minRecordToWait = rowsToFillPage; - } - } - if (minRecordToWait == Long.MAX_VALUE) { - minRecordToWait = props.getMinRowCountForPageSizeCheck(); - } - - if(props.estimateNextSizeCheck()) { - // will check again halfway if between min and max - rowCountForNextSizeCheck = rowCount + - min( - max(minRecordToWait / 2, props.getMinRowCountForPageSizeCheck()), - props.getMaxRowCountForPageSizeCheck()); - } else { - rowCountForNextSizeCheck = rowCount + props.getMinRowCountForPageSizeCheck(); - } - } - } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java new file mode 100644 index 0000000000..7f9ba4d868 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java @@ -0,0 +1,400 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.impl; + +import java.io.IOException; +import java.util.HashMap; + +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.ColumnWriter; +import org.apache.parquet.column.ParquetProperties; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.page.PageWriter; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter; +import org.apache.parquet.column.values.bloomfilter.BloomFilter; +import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter; +import org.apache.parquet.io.ParquetEncodingException; +import org.apache.parquet.io.api.Binary; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Base implementation for {@link ColumnWriter} to be extended to specialize for V1 and V2 pages. + */ +abstract class ColumnWriterBase implements ColumnWriter { + private static final Logger LOG = LoggerFactory.getLogger(ColumnWriterBase.class); + + // By default: Debugging disabled this way (using the "if (DEBUG)" IN the methods) to allow + // the java compiler (not the JIT) to remove the unused statements during build time. 
+ private static final boolean DEBUG = false; + + final ColumnDescriptor path; + final PageWriter pageWriter; + private ValuesWriter repetitionLevelColumn; + private ValuesWriter definitionLevelColumn; + private ValuesWriter dataColumn; + private int valueCount; + + private Statistics statistics; + private long rowsWrittenSoFar = 0; + private int pageRowCount; + + private BloomFilterWriter bloomFilterWriter; + private BloomFilter bloomFilter; + + ColumnWriterBase( + ColumnDescriptor path, + PageWriter pageWriter, + ParquetProperties props) { + this.path = path; + this.pageWriter = pageWriter; + resetStatistics(); + + this.repetitionLevelColumn = createRLWriter(props, path); + this.definitionLevelColumn = createDLWriter(props, path); + this.dataColumn = props.newValuesWriter(path); + } + + ColumnWriterBase( + ColumnDescriptor path, + PageWriter pageWriter, + BloomFilterWriter bloomFilterWriter, + ParquetProperties props + ) { + this(path, pageWriter, props); + + // Nested columns are not currently supported. 
+ if (path.getPath().length != 1 || bloomFilterWriter == null) { + return; + } + + this.bloomFilterWriter = bloomFilterWriter; + HashMap bloomFilterExpectValues = props.getBloomFilterExpectedDistinctNumbers(); + String column = path.getPath()[0]; + if (bloomFilterExpectValues.keySet().contains(column)) { + int optimalNumOfBits = BlockSplitBloomFilter.optimalNumOfBits(bloomFilterExpectValues.get(column).intValue(), + BlockSplitBloomFilter.DEFAULT_FPP); + this.bloomFilter = new BlockSplitBloomFilter(optimalNumOfBits/8); + } + } + + abstract ValuesWriter createRLWriter(ParquetProperties props, ColumnDescriptor path); + + abstract ValuesWriter createDLWriter(ParquetProperties props, ColumnDescriptor path); + + private void log(Object value, int r, int d) { + LOG.debug("{} {} r:{} d:{}", path, value, r, d); + } + + private void resetStatistics() { + this.statistics = Statistics.createStats(path.getPrimitiveType()); + } + + private void definitionLevel(int definitionLevel) { + definitionLevelColumn.writeInteger(definitionLevel); + } + + private void repetitionLevel(int repetitionLevel) { + repetitionLevelColumn.writeInteger(repetitionLevel); + assert pageRowCount == 0 ? repetitionLevel == 0 : true : "Every page shall start on record boundaries"; + if (repetitionLevel == 0) { + ++pageRowCount; + } + } + + /** + * Writes the current null value + * + * @param repetitionLevel + * @param definitionLevel + */ + @Override + public void writeNull(int repetitionLevel, int definitionLevel) { + if (DEBUG) + log(null, repetitionLevel, definitionLevel); + repetitionLevel(repetitionLevel); + definitionLevel(definitionLevel); + statistics.incrementNumNulls(); + ++valueCount; + } + + @Override + public void close() { + // Close the Values writers. 
+ repetitionLevelColumn.close(); + definitionLevelColumn.close(); + dataColumn.close(); + } + + @Override + public long getBufferedSizeInMemory() { + return repetitionLevelColumn.getBufferedSize() + + definitionLevelColumn.getBufferedSize() + + dataColumn.getBufferedSize() + + pageWriter.getMemSize(); + } + + + private void updateBloomFilter(int value) { + if (bloomFilter != null) { + bloomFilter.insert(bloomFilter.hash(value)); + } + } + + private void updateBloomFilter(long value) { + if (bloomFilter != null) { + bloomFilter.insert(bloomFilter.hash(value)); + } + } + + private void updateBloomFilter(double value) { + if (bloomFilter != null) { + bloomFilter.insert(bloomFilter.hash(value)); + } + } + + private void updateBloomFilter(float value) { + if (bloomFilter != null) { + bloomFilter.insert(bloomFilter.hash(value)); + } + } + + private void updateBloomFilter(Binary value) { + if (bloomFilter != null) { + bloomFilter.insert(bloomFilter.hash(value)); + } + } + + /** + * Writes the current value + * + * @param value + * @param repetitionLevel + * @param definitionLevel + */ + @Override + public void write(double value, int repetitionLevel, int definitionLevel) { + if (DEBUG) + log(value, repetitionLevel, definitionLevel); + repetitionLevel(repetitionLevel); + definitionLevel(definitionLevel); + dataColumn.writeDouble(value); + statistics.updateStats(value); + updateBloomFilter(value); + ++valueCount; + } + + /** + * Writes the current value + * + * @param value + * @param repetitionLevel + * @param definitionLevel + */ + @Override + public void write(float value, int repetitionLevel, int definitionLevel) { + if (DEBUG) + log(value, repetitionLevel, definitionLevel); + repetitionLevel(repetitionLevel); + definitionLevel(definitionLevel); + dataColumn.writeFloat(value); + statistics.updateStats(value); + updateBloomFilter(value); + ++valueCount; + } + + /** + * Writes the current value + * + * @param value + * @param repetitionLevel + * @param definitionLevel + 
*/ + @Override + public void write(Binary value, int repetitionLevel, int definitionLevel) { + if (DEBUG) + log(value, repetitionLevel, definitionLevel); + repetitionLevel(repetitionLevel); + definitionLevel(definitionLevel); + dataColumn.writeBytes(value); + statistics.updateStats(value); + updateBloomFilter(value); + ++valueCount; + } + + /** + * Writes the current value + * + * @param value + * @param repetitionLevel + * @param definitionLevel + */ + @Override + public void write(boolean value, int repetitionLevel, int definitionLevel) { + if (DEBUG) + log(value, repetitionLevel, definitionLevel); + repetitionLevel(repetitionLevel); + definitionLevel(definitionLevel); + dataColumn.writeBoolean(value); + statistics.updateStats(value); + ++valueCount; + } + + /** + * Writes the current value + * + * @param value + * @param repetitionLevel + * @param definitionLevel + */ + @Override + public void write(int value, int repetitionLevel, int definitionLevel) { + if (DEBUG) + log(value, repetitionLevel, definitionLevel); + repetitionLevel(repetitionLevel); + definitionLevel(definitionLevel); + dataColumn.writeInteger(value); + statistics.updateStats(value); + updateBloomFilter(value); + ++valueCount; + } + + /** + * Writes the current value + * + * @param value + * @param repetitionLevel + * @param definitionLevel + */ + @Override + public void write(long value, int repetitionLevel, int definitionLevel) { + if (DEBUG) + log(value, repetitionLevel, definitionLevel); + repetitionLevel(repetitionLevel); + definitionLevel(definitionLevel); + dataColumn.writeLong(value); + statistics.updateStats(value); + updateBloomFilter(value); + ++valueCount; + } + + /** + * Finalizes the Column chunk. Possibly adding extra pages if needed (dictionary, ...) 
+ * Is called right after writePage + */ + void finalizeColumnChunk() { + final DictionaryPage dictionaryPage = dataColumn.toDictPageAndClose(); + if (dictionaryPage != null) { + if (DEBUG) + LOG.debug("write dictionary"); + try { + pageWriter.writeDictionaryPage(dictionaryPage); + } catch (IOException e) { + throw new ParquetEncodingException("could not write dictionary page for " + path, e); + } + dataColumn.resetDictionary(); + } + + if (bloomFilterWriter != null && bloomFilter != null) { + bloomFilterWriter.writeBloomFilter(bloomFilter); + } + } + + /** + * Used to decide when to write a page + * + * @return the number of bytes of memory used to buffer the current data + */ + long getCurrentPageBufferedSize() { + return repetitionLevelColumn.getBufferedSize() + + definitionLevelColumn.getBufferedSize() + + dataColumn.getBufferedSize(); + } + + /** + * Used to decide when to write a page or row group + * + * @return the number of bytes of memory used to buffer the current data and the previously written pages + */ + long getTotalBufferedSize() { + long bloomBufferSize = bloomFilter == null ? 0 : bloomFilter.getBitsetSize(); + return repetitionLevelColumn.getBufferedSize() + + definitionLevelColumn.getBufferedSize() + + dataColumn.getBufferedSize() + + pageWriter.getMemSize() + + bloomBufferSize; + } + + /** + * @return actual memory used + */ + long allocatedSize() { + long bloomAllocatedSize = bloomFilter == null ? 
0 : bloomFilter.getBitsetSize(); + return repetitionLevelColumn.getAllocatedSize() + + definitionLevelColumn.getAllocatedSize() + + dataColumn.getAllocatedSize() + + pageWriter.allocatedSize() + + bloomAllocatedSize; + } + + /** + * @param indent + * a prefix to format lines + * @return a formatted string showing how memory is used + */ + String memUsageString(String indent) { + StringBuilder b = new StringBuilder(indent).append(path).append(" {\n"); + b.append(indent).append(" r:").append(repetitionLevelColumn.getAllocatedSize()).append(" bytes\n"); + b.append(indent).append(" d:").append(definitionLevelColumn.getAllocatedSize()).append(" bytes\n"); + b.append(dataColumn.memUsageString(indent + " data:")).append("\n"); + b.append(pageWriter.memUsageString(indent + " pages:")).append("\n"); + b.append(indent).append(String.format(" total: %,d/%,d", getTotalBufferedSize(), allocatedSize())).append("\n"); + b.append(indent).append("}\n"); + return b.toString(); + } + + long getRowsWrittenSoFar() { + return this.rowsWrittenSoFar; + } + + /** + * Writes the current data to a new page in the page store + */ + void writePage() { + this.rowsWrittenSoFar += pageRowCount; + if (DEBUG) + LOG.debug("write page"); + try { + writePage(pageRowCount, valueCount, statistics, repetitionLevelColumn, definitionLevelColumn, dataColumn); + } catch (IOException e) { + throw new ParquetEncodingException("could not write page for " + path, e); + } + repetitionLevelColumn.reset(); + definitionLevelColumn.reset(); + dataColumn.reset(); + valueCount = 0; + resetStatistics(); + pageRowCount = 0; + } + + abstract void writePage(int rowCount, int valueCount, Statistics statistics, ValuesWriter repetitionLevels, + ValuesWriter definitionLevels, ValuesWriter values) throws IOException; +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java index c5fc9dc549..1d732b837d 100644 
--- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java @@ -21,332 +21,47 @@ import static org.apache.parquet.bytes.BytesInput.concat; import java.io.IOException; -import java.util.HashMap; import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.column.ColumnWriter; import org.apache.parquet.column.ParquetProperties; -import org.apache.parquet.column.page.DictionaryPage; import org.apache.parquet.column.page.PageWriter; import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.column.values.ValuesWriter; -import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter; -import org.apache.parquet.column.values.bloomfilter.BloomFilter; import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter; -import org.apache.parquet.io.ParquetEncodingException; -import org.apache.parquet.io.api.Binary; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * Writes (repetition level, definition level, value) triplets and deals with writing pages to the underlying layer. */ -final class ColumnWriterV1 implements ColumnWriter { - private static final Logger LOG = LoggerFactory.getLogger(ColumnWriterV1.class); - - // By default: Debugging disabled this way (using the "if (DEBUG)" IN the methods) to allow - // the java compiler (not the JIT) to remove the unused statements during build time. 
- private static final boolean DEBUG = false; - - private final ColumnDescriptor path; - private final PageWriter pageWriter; - private final ParquetProperties props; - - private ValuesWriter repetitionLevelColumn; - private ValuesWriter definitionLevelColumn; - private ValuesWriter dataColumn; - private int valueCount; - private int valueCountForNextSizeCheck; - - private Statistics statistics; - private BloomFilterWriter bloomFilterWriter; - private BloomFilter bloomFilter; - - public ColumnWriterV1(ColumnDescriptor path, PageWriter pageWriter, - BloomFilterWriter bloomFilterWriter, ParquetProperties props) { - this(path, pageWriter, props); - - // Current not support nested column. - if (path.getPath().length != 1 || bloomFilterWriter == null) { - return; - } - - this.bloomFilterWriter = bloomFilterWriter; - HashMap bloomFilterExpectValues = props.getBloomFilterExpectValues(); - String column = path.getPath()[0]; - if (bloomFilterExpectValues.keySet().contains(column)) { - int optimalNumOfBits = BlockSplitBloomFilter.optimalNumOfBits(bloomFilterExpectValues.get(column).intValue(), - BlockSplitBloomFilter.DEFAULT_FPP); - this.bloomFilter = new BlockSplitBloomFilter(optimalNumOfBits/8); - } - +final class ColumnWriterV1 extends ColumnWriterBase { + ColumnWriterV1(ColumnDescriptor path, PageWriter pageWriter, ParquetProperties props) { + super(path, pageWriter, props); } public ColumnWriterV1(ColumnDescriptor path, PageWriter pageWriter, - ParquetProperties props) { - this.path = path; - this.pageWriter = pageWriter; - this.props = props; - - // initial check of memory usage. 
So that we have enough data to make an initial prediction - this.valueCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck(); - - resetStatistics(); - - this.repetitionLevelColumn = props.newRepetitionLevelWriter(path); - this.definitionLevelColumn = props.newDefinitionLevelWriter(path); - this.dataColumn = props.newValuesWriter(path); - } - - private void log(Object value, int r, int d) { - if (DEBUG) LOG.debug( "{} {} r:{} d:{}", path, value, r, d); - } - - private void resetStatistics() { - this.statistics = Statistics.createStats(this.path.getPrimitiveType()); - } - - /** - * Counts how many values have been written and checks the memory usage to flush the page when we reach the page threshold. - * - * We measure the memory used when we reach the mid point toward our estimated count. - * We then update the estimate and flush the page if we reached the threshold. - * - * That way we check the memory size log2(n) times. - * - */ - private void accountForValueWritten() { - ++ valueCount; - if (valueCount > valueCountForNextSizeCheck) { - // not checking the memory used for every value - long memSize = repetitionLevelColumn.getBufferedSize() - + definitionLevelColumn.getBufferedSize() - + dataColumn.getBufferedSize(); - if (memSize > props.getPageSizeThreshold()) { - // we will write the current page and check again the size at the predicted middle of next page - if (props.estimateNextSizeCheck()) { - valueCountForNextSizeCheck = valueCount / 2; - } else { - valueCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck(); - } - writePage(); - } else if (props.estimateNextSizeCheck()) { - // not reached the threshold, will check again midway - valueCountForNextSizeCheck = (int)(valueCount + ((float)valueCount * props.getPageSizeThreshold() / memSize)) / 2 + 1; - } else { - valueCountForNextSizeCheck += props.getMinRowCountForPageSizeCheck(); - } - } - } - - private void updateStatisticsNumNulls() { - statistics.incrementNumNulls(); - } - - private void 
updateStatistics(int value) { - statistics.updateStats(value); - } - - private void updateStatistics(long value) { - statistics.updateStats(value); - } - - private void updateStatistics(float value) { - statistics.updateStats(value); - } - - private void updateStatistics(double value) { - statistics.updateStats(value); - } - - private void updateStatistics(Binary value) { - statistics.updateStats(value); - } - - private void updateStatistics(boolean value) { - statistics.updateStats(value); - } - - private void updateBloomFilter(int value) { - if (bloomFilter != null) { - bloomFilter.insert(bloomFilter.hash(value)); - } - } - - private void updateBloomFilter(long value) { - if (bloomFilter != null) { - bloomFilter.insert(bloomFilter.hash(value)); - } - } - - private void updateBloomFilter(double value) { - if (bloomFilter != null) { - bloomFilter.insert(bloomFilter.hash(value)); - } - } - - private void updateBloomFilter(float value) { - if (bloomFilter != null) { - bloomFilter.insert(bloomFilter.hash(value)); - } - } - - private void updateBloomFilter(Binary value) { - if (bloomFilter != null) { - bloomFilter.insert(bloomFilter.hash(value)); - } - } - - private void writePage() { - if (DEBUG) LOG.debug("write page"); - try { - pageWriter.writePage( - concat(repetitionLevelColumn.getBytes(), definitionLevelColumn.getBytes(), dataColumn.getBytes()), - valueCount, - statistics, - repetitionLevelColumn.getEncoding(), - definitionLevelColumn.getEncoding(), - dataColumn.getEncoding()); - } catch (IOException e) { - throw new ParquetEncodingException("could not write page for " + path, e); - } - repetitionLevelColumn.reset(); - definitionLevelColumn.reset(); - dataColumn.reset(); - valueCount = 0; - resetStatistics(); - } - - @Override - public void writeNull(int repetitionLevel, int definitionLevel) { - if (DEBUG) log(null, repetitionLevel, definitionLevel); - repetitionLevelColumn.writeInteger(repetitionLevel); - definitionLevelColumn.writeInteger(definitionLevel); - 
updateStatisticsNumNulls(); - accountForValueWritten(); - } - - @Override - public void write(double value, int repetitionLevel, int definitionLevel) { - if (DEBUG) log(value, repetitionLevel, definitionLevel); - repetitionLevelColumn.writeInteger(repetitionLevel); - definitionLevelColumn.writeInteger(definitionLevel); - dataColumn.writeDouble(value); - updateStatistics(value); - updateBloomFilter(value); - accountForValueWritten(); - } - - @Override - public void write(float value, int repetitionLevel, int definitionLevel) { - if (DEBUG) log(value, repetitionLevel, definitionLevel); - repetitionLevelColumn.writeInteger(repetitionLevel); - definitionLevelColumn.writeInteger(definitionLevel); - dataColumn.writeFloat(value); - updateStatistics(value); - updateBloomFilter(value); - accountForValueWritten(); - } - - @Override - public void write(Binary value, int repetitionLevel, int definitionLevel) { - if (DEBUG) log(value, repetitionLevel, definitionLevel); - repetitionLevelColumn.writeInteger(repetitionLevel); - definitionLevelColumn.writeInteger(definitionLevel); - dataColumn.writeBytes(value); - updateStatistics(value); - updateBloomFilter(value); - accountForValueWritten(); - } - - @Override - public void write(boolean value, int repetitionLevel, int definitionLevel) { - if (DEBUG) log(value, repetitionLevel, definitionLevel); - repetitionLevelColumn.writeInteger(repetitionLevel); - definitionLevelColumn.writeInteger(definitionLevel); - dataColumn.writeBoolean(value); - updateStatistics(value); - accountForValueWritten(); - } - - @Override - public void write(int value, int repetitionLevel, int definitionLevel) { - if (DEBUG) log(value, repetitionLevel, definitionLevel); - repetitionLevelColumn.writeInteger(repetitionLevel); - definitionLevelColumn.writeInteger(definitionLevel); - dataColumn.writeInteger(value); - updateStatistics(value); - updateBloomFilter(value); - accountForValueWritten(); + BloomFilterWriter bloomFilterWriter, ParquetProperties props) { + 
super(path, pageWriter, bloomFilterWriter, props); } @Override - public void write(long value, int repetitionLevel, int definitionLevel) { - if (DEBUG) log(value, repetitionLevel, definitionLevel); - repetitionLevelColumn.writeInteger(repetitionLevel); - definitionLevelColumn.writeInteger(definitionLevel); - dataColumn.writeLong(value); - updateStatistics(value); - updateBloomFilter(value); - accountForValueWritten(); - } - - public void flush() { - if (valueCount > 0) { - writePage(); - } - final DictionaryPage dictionaryPage = dataColumn.toDictPageAndClose(); - if (dictionaryPage != null) { - if (DEBUG) LOG.debug("write dictionary"); - try { - pageWriter.writeDictionaryPage(dictionaryPage); - } catch (IOException e) { - throw new ParquetEncodingException("could not write dictionary page for " + path, e); - } - dataColumn.resetDictionary(); - } - - if (bloomFilterWriter != null && bloomFilter != null) { - bloomFilterWriter.writeBloomFilter(bloomFilter); - } + ValuesWriter createRLWriter(ParquetProperties props, ColumnDescriptor path) { + return props.newRepetitionLevelWriter(path); } @Override - public void close() { - flush(); - // Close the Values writers. - repetitionLevelColumn.close(); - definitionLevelColumn.close(); - dataColumn.close(); + ValuesWriter createDLWriter(ParquetProperties props, ColumnDescriptor path) { + return props.newDefinitionLevelWriter(path); } @Override - public long getBufferedSizeInMemory() { - long bloomBufferSize = bloomFilter == null ? 0 : bloomFilter.getBitsetSize(); - return repetitionLevelColumn.getBufferedSize() - + definitionLevelColumn.getBufferedSize() - + dataColumn.getBufferedSize() - + pageWriter.getMemSize() - + bloomBufferSize; - } - - public long allocatedSize() { - long bloomAllocatedSize = bloomFilter == null ? 
0 : bloomFilter.getBitsetSize(); - return repetitionLevelColumn.getAllocatedSize() - + definitionLevelColumn.getAllocatedSize() - + dataColumn.getAllocatedSize() - + pageWriter.allocatedSize() - + bloomAllocatedSize; - } - - public String memUsageString(String indent) { - StringBuilder b = new StringBuilder(indent).append(path).append(" {\n"); - b.append(repetitionLevelColumn.memUsageString(indent + " r:")).append("\n"); - b.append(definitionLevelColumn.memUsageString(indent + " d:")).append("\n"); - b.append(dataColumn.memUsageString(indent + " data:")).append("\n"); - b.append(pageWriter.memUsageString(indent + " pages:")).append("\n"); - b.append(indent).append(String.format(" total: %,d/%,d", getBufferedSizeInMemory(), allocatedSize())).append("\n"); - b.append(indent).append("}\n"); - return b.toString(); + void writePage(int rowCount, int valueCount, Statistics statistics, ValuesWriter repetitionLevels, + ValuesWriter definitionLevels, ValuesWriter values) throws IOException { + pageWriter.writePage( + concat(repetitionLevels.getBytes(), definitionLevels.getBytes(), values.getBytes()), + valueCount, + rowCount, + statistics, + repetitionLevels.getEncoding(), + definitionLevels.getEncoding(), + values.getEncoding()); } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java index 7b1671407a..ad7077bd19 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java @@ -19,349 +19,77 @@ package org.apache.parquet.column.impl; import java.io.IOException; -import java.util.HashMap; import org.apache.parquet.Ints; import org.apache.parquet.bytes.BytesInput; import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.column.ColumnWriter; import org.apache.parquet.column.Encoding; import 
org.apache.parquet.column.ParquetProperties; -import org.apache.parquet.column.page.DictionaryPage; import org.apache.parquet.column.page.PageWriter; import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.column.values.ValuesWriter; -import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter; -import org.apache.parquet.column.values.bloomfilter.BloomFilter; +import org.apache.parquet.column.values.bitpacking.DevNullValuesWriter; import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter; import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter; import org.apache.parquet.io.ParquetEncodingException; -import org.apache.parquet.io.api.Binary; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * Writes (repetition level, definition level, value) triplets and deals with writing pages to the underlying layer. */ -final class ColumnWriterV2 implements ColumnWriter { - private static final Logger LOG = LoggerFactory.getLogger(ColumnWriterV2.class); +final class ColumnWriterV2 extends ColumnWriterBase { - // By default: Debugging disabled this way (using the "if (DEBUG)" IN the methods) to allow - // the java compiler (not the JIT) to remove the unused statements during build time. 
- private static final boolean DEBUG = false; - - private final ColumnDescriptor path; - private final PageWriter pageWriter; - private RunLengthBitPackingHybridEncoder repetitionLevelColumn; - private RunLengthBitPackingHybridEncoder definitionLevelColumn; - private ValuesWriter dataColumn; - private int valueCount; - - private BloomFilterWriter bloomFilterWriter; - private BloomFilter bloomFilter; - - private Statistics statistics; - private long rowsWrittenSoFar = 0; - - public ColumnWriterV2( - ColumnDescriptor path, - PageWriter pageWriter, - ParquetProperties props) { - this.path = path; - this.pageWriter = pageWriter; - resetStatistics(); - - this.repetitionLevelColumn = props.newRepetitionLevelEncoder(path); - this.definitionLevelColumn = props.newDefinitionLevelEncoder(path); - this.dataColumn = props.newValuesWriter(path); - } - - public ColumnWriterV2( - ColumnDescriptor path, - PageWriter pageWriter, - BloomFilterWriter bloomFilterWriter, - ParquetProperties props) { - this(path, pageWriter, props); - - // Current not support nested column. 
- if (path.getPath().length != 1 || bloomFilterWriter == null) { - return; - } - - this.bloomFilterWriter = bloomFilterWriter; - HashMap bloomFilterExpectValues = props.getBloomFilterExpectValues(); - String column = path.getPath()[0]; - if (bloomFilterExpectValues.keySet().contains(column)) { - int optimalNumOfBits = BlockSplitBloomFilter.optimalNumOfBits(bloomFilterExpectValues.get(column).intValue(), - BlockSplitBloomFilter.DEFAULT_FPP); - this.bloomFilter = new BlockSplitBloomFilter(optimalNumOfBits/8); + // Extends the original implementation so that it does not write the size of the data, as the original writer would + private static class RLEWriterForV2 extends RunLengthBitPackingHybridValuesWriter { + public RLEWriterForV2(RunLengthBitPackingHybridEncoder encoder) { + super(encoder); } - } - private void log(Object value, int r, int d) { - LOG.debug("{} {} r:{} d:{}", path, value, r, d); - } - - private void resetStatistics() { - this.statistics = Statistics.createStats(path.getPrimitiveType()); - } - - private void definitionLevel(int definitionLevel) { - try { - definitionLevelColumn.writeInt(definitionLevel); - } catch (IOException e) { - throw new ParquetEncodingException("illegal definition level " + definitionLevel + " for column " + path, e); - } - } - - private void repetitionLevel(int repetitionLevel) { - try { - repetitionLevelColumn.writeInt(repetitionLevel); - } catch (IOException e) { - throw new ParquetEncodingException("illegal repetition level " + repetitionLevel + " for column " + path, e); - } - } - - /** - * writes the current null value - * @param repetitionLevel - * @param definitionLevel - */ - public void writeNull(int repetitionLevel, int definitionLevel) { - if (DEBUG) log(null, repetitionLevel, definitionLevel); - repetitionLevel(repetitionLevel); - definitionLevel(definitionLevel); - statistics.incrementNumNulls(); - ++ valueCount; - } - - @Override - public void close() { - // Close the Values writers. 
- repetitionLevelColumn.close(); - definitionLevelColumn.close(); - dataColumn.close(); - } - - @Override - public long getBufferedSizeInMemory() { - return repetitionLevelColumn.getBufferedSize() - + definitionLevelColumn.getBufferedSize() - + dataColumn.getBufferedSize() - + pageWriter.getMemSize(); - } - - /** - * writes the current value - * @param value - * @param repetitionLevel - * @param definitionLevel - */ - public void write(double value, int repetitionLevel, int definitionLevel) { - if (DEBUG) log(value, repetitionLevel, definitionLevel); - repetitionLevel(repetitionLevel); - definitionLevel(definitionLevel); - dataColumn.writeDouble(value); - statistics.updateStats(value); - if (bloomFilter != null) { - bloomFilter.insert(bloomFilter.hash(value)); - } - ++ valueCount; - } - - /** - * writes the current value - * @param value - * @param repetitionLevel - * @param definitionLevel - */ - public void write(float value, int repetitionLevel, int definitionLevel) { - if (DEBUG) log(value, repetitionLevel, definitionLevel); - repetitionLevel(repetitionLevel); - definitionLevel(definitionLevel); - dataColumn.writeFloat(value); - statistics.updateStats(value); - if (bloomFilter != null) { - bloomFilter.insert(bloomFilter.hash(value)); - } - ++ valueCount; - } - - /** - * writes the current value - * @param value - * @param repetitionLevel - * @param definitionLevel - */ - public void write(Binary value, int repetitionLevel, int definitionLevel) { - if (DEBUG) log(value, repetitionLevel, definitionLevel); - repetitionLevel(repetitionLevel); - definitionLevel(definitionLevel); - dataColumn.writeBytes(value); - statistics.updateStats(value); - if (bloomFilter != null) { - bloomFilter.insert(bloomFilter.hash(value)); - } - ++ valueCount; - } - - /** - * writes the current value - * @param value - * @param repetitionLevel - * @param definitionLevel - */ - public void write(boolean value, int repetitionLevel, int definitionLevel) { - if (DEBUG) log(value, 
repetitionLevel, definitionLevel); - repetitionLevel(repetitionLevel); - definitionLevel(definitionLevel); - dataColumn.writeBoolean(value); - statistics.updateStats(value); - ++ valueCount; - } - - /** - * writes the current value - * @param value - * @param repetitionLevel - * @param definitionLevel - */ - public void write(int value, int repetitionLevel, int definitionLevel) { - if (DEBUG) log(value, repetitionLevel, definitionLevel); - repetitionLevel(repetitionLevel); - definitionLevel(definitionLevel); - dataColumn.writeInteger(value); - statistics.updateStats(value); - if (bloomFilter != null) { - bloomFilter.insert(bloomFilter.hash(value)); - } - ++ valueCount; - } - - /** - * writes the current value - * @param value - * @param repetitionLevel - * @param definitionLevel - */ - public void write(long value, int repetitionLevel, int definitionLevel) { - if (DEBUG) log(value, repetitionLevel, definitionLevel); - repetitionLevel(repetitionLevel); - definitionLevel(definitionLevel); - dataColumn.writeLong(value); - statistics.updateStats(value); - if (bloomFilter != null) { - bloomFilter.insert(bloomFilter.hash(value)); - } - ++ valueCount; - } - - /** - * Finalizes the Column chunk. Possibly adding extra pages if needed (dictionary, ...) 
- * Is called right after writePage - */ - public void finalizeColumnChunk() { - final DictionaryPage dictionaryPage = dataColumn.toDictPageAndClose(); - if (dictionaryPage != null) { - if (DEBUG) LOG.debug("write dictionary"); + @Override + public BytesInput getBytes() { try { - pageWriter.writeDictionaryPage(dictionaryPage); + return encoder.toBytes(); } catch (IOException e) { - throw new ParquetEncodingException("could not write dictionary page for " + path, e); + throw new ParquetEncodingException(e); } - dataColumn.resetDictionary(); - } - - if (bloomFilterWriter != null && bloomFilter != null) { - bloomFilterWriter.writeBloomFilter(bloomFilter); } } - /** - * used to decide when to write a page - * @return the number of bytes of memory used to buffer the current data - */ - public long getCurrentPageBufferedSize() { - long bloomBufferSize = bloomFilter == null ? 0 : bloomFilter.getBitsetSize(); - return repetitionLevelColumn.getBufferedSize() - + definitionLevelColumn.getBufferedSize() - + dataColumn.getBufferedSize() - + bloomBufferSize; - } + private static final ValuesWriter NULL_WRITER = new DevNullValuesWriter(); - /** - * used to decide when to write a page or row group - * @return the number of bytes of memory used to buffer the current data and the previously written pages - */ - public long getTotalBufferedSize() { - long bloomBufferSize = bloomFilter == null ? 0 : bloomFilter.getBitsetSize(); - return repetitionLevelColumn.getBufferedSize() - + definitionLevelColumn.getBufferedSize() - + dataColumn.getBufferedSize() - + pageWriter.getMemSize() - + bloomBufferSize; + ColumnWriterV2(ColumnDescriptor path, PageWriter pageWriter, ParquetProperties props) { + super(path, pageWriter, props); } - /** - * @return actual memory used - */ - public long allocatedSize() { - long bloomFilterSize = bloomFilter == null ? 
0 : bloomFilter.getBitsetSize(); - return repetitionLevelColumn.getAllocatedSize() - + definitionLevelColumn.getAllocatedSize() - + dataColumn.getAllocatedSize() - + pageWriter.allocatedSize() - + bloomFilterSize; + ColumnWriterV2(ColumnDescriptor path, PageWriter pageWriter, BloomFilterWriter bloomFilterWriter, + ParquetProperties props) { + super(path, pageWriter, bloomFilterWriter, props); } - /** - * @param indent a prefix to format lines - * @return a formatted string showing how memory is used - */ - public String memUsageString(String indent) { - StringBuilder b = new StringBuilder(indent).append(path).append(" {\n"); - b.append(indent).append(" r:").append(repetitionLevelColumn.getAllocatedSize()).append(" bytes\n"); - b.append(indent).append(" d:").append(definitionLevelColumn.getAllocatedSize()).append(" bytes\n"); - b.append(dataColumn.memUsageString(indent + " data:")).append("\n"); - b.append(pageWriter.memUsageString(indent + " pages:")).append("\n"); - b.append(indent).append(String.format(" total: %,d/%,d", getTotalBufferedSize(), allocatedSize())).append("\n"); - b.append(indent).append("}\n"); - return b.toString(); + @Override + ValuesWriter createRLWriter(ParquetProperties props, ColumnDescriptor path) { + return path.getMaxRepetitionLevel() == 0 ? NULL_WRITER : new RLEWriterForV2(props.newRepetitionLevelEncoder(path)); } - public long getRowsWrittenSoFar() { - return this.rowsWrittenSoFar; + @Override + ValuesWriter createDLWriter(ParquetProperties props, ColumnDescriptor path) { + return path.getMaxDefinitionLevel() == 0 ? NULL_WRITER : new RLEWriterForV2(props.newDefinitionLevelEncoder(path)); } - /** - * writes the current data to a new page in the page store - * @param rowCount how many rows have been written so far - */ - public void writePage(long rowCount) { - int pageRowCount = Ints.checkedCast(rowCount - rowsWrittenSoFar); - this.rowsWrittenSoFar = rowCount; - if (DEBUG) LOG.debug("write page"); - try { - // TODO: rework this API. 
Those must be called *in that order* - BytesInput bytes = dataColumn.getBytes(); - Encoding encoding = dataColumn.getEncoding(); - pageWriter.writePageV2( - pageRowCount, - Ints.checkedCast(statistics.getNumNulls()), - valueCount, - path.getMaxRepetitionLevel() == 0 ? BytesInput.empty() : repetitionLevelColumn.toBytes(), - path.getMaxDefinitionLevel() == 0 ? BytesInput.empty() : definitionLevelColumn.toBytes(), - encoding, - bytes, - statistics - ); - } catch (IOException e) { - throw new ParquetEncodingException("could not write page for " + path, e); - } - repetitionLevelColumn.reset(); - definitionLevelColumn.reset(); - dataColumn.reset(); - valueCount = 0; - resetStatistics(); + @Override + void writePage(int rowCount, int valueCount, Statistics statistics, ValuesWriter repetitionLevels, + ValuesWriter definitionLevels, ValuesWriter values) throws IOException { + // TODO: rework this API. The bytes shall be retrieved before the encoding (encoding might be different otherwise) + BytesInput bytes = values.getBytes(); + Encoding encoding = values.getEncoding(); + pageWriter.writePageV2( + rowCount, + Ints.checkedCast(statistics.getNumNulls()), + valueCount, + repetitionLevels.getBytes(), + definitionLevels.getBytes(), + encoding, + bytes, + statistics); } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/SynchronizingColumnReader.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/SynchronizingColumnReader.java new file mode 100644 index 0000000000..50f05c8af3 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/SynchronizingColumnReader.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.impl; + +import java.util.PrimitiveIterator; + +import org.apache.parquet.VersionParser.ParsedVersion; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.ColumnReader; +import org.apache.parquet.column.page.DataPage; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.PrimitiveConverter; + +/** + * A {@link ColumnReader} implementation for utilizing indexes. When filtering using column indexes we might skip + * reading some pages for different columns. Because the rows are not aligned between the pages of the different columns + * it might be required to skip some values in this {@link ColumnReader} so we provide only the required values for the + * higher API ({@link RecordReader}) and they do not need to handle or know about the skipped pages. The values (and the + * related rl and dl) are skipped based on the iterator of the required row indexes and the first row index of each + * page.
+ * For example: + * + * <pre>
+ * rows   col1   col2   col3
+ *      ┌──────┬──────┬──────┐
+ *   0  │  p0  │      │      │
+ *      ╞══════╡  p0  │  p0  │
+ *  20  │ p1(X)│------│------│
+ *      ╞══════╪══════╡      │
+ *  40  │ p2(X)│      │------│
+ *      ╞══════╡ p1(X)╞══════╡
+ *  60  │ p3(X)│      │------│
+ *      ╞══════╪══════╡      │
+ *  80  │  p4  │      │  p1  │
+ *      ╞══════╡  p2  │      │
+ * 100  │  p5  │      │      │
+ *      └──────┴──────┴──────┘
+ * </pre>
+ * + * The pages 1, 2, 3 in col1 are skipped so we have to skip the rows [20, 79]. Because page 1 in col2 contains values + * only for the rows [40, 79] we skip this entire page as well. To synchronize the row reading we have to skip the + * values (and the related rl and dl) for the rows [20, 39] in the end of the page 0 for col2. Similarly, we have to + * skip values while reading page0 and page1 for col3. + */ +class SynchronizingColumnReader extends ColumnReaderBase { + + private final PrimitiveIterator.OfLong rowIndexes; + private long currentRow; + private long targetRow; + private long lastRowInPage; + private int valuesReadFromPage; + + SynchronizingColumnReader(ColumnDescriptor path, PageReader pageReader, PrimitiveConverter converter, + ParsedVersion writerVersion, PrimitiveIterator.OfLong rowIndexes) { + super(path, pageReader, converter, writerVersion); + this.rowIndexes = rowIndexes; + targetRow = Long.MIN_VALUE; + consume(); + } + + @Override + boolean isPageFullyConsumed() { + return getPageValueCount() <= valuesReadFromPage || lastRowInPage < targetRow; + } + + @Override + boolean isFullyConsumed() { + return !rowIndexes.hasNext(); + } + + @Override + boolean skipRL(int rl) { + ++valuesReadFromPage; + if (rl == 0) { + ++currentRow; + if (currentRow > targetRow) { + targetRow = rowIndexes.hasNext() ? 
rowIndexes.nextLong() : Long.MAX_VALUE; + } + } + return currentRow < targetRow; + } + + @Override + protected void newPageInitialized(DataPage page) { + long firstRowIndex = page.getFirstRowIndex() + .orElseThrow(() -> new IllegalArgumentException("Missing firstRowIndex for synchronizing values")); + int rowCount = page.getIndexRowCount() + .orElseThrow(() -> new IllegalArgumentException("Missing rowCount for synchronizing values")); + currentRow = firstRowIndex - 1; + lastRowInPage = firstRowIndex + rowCount - 1; + valuesReadFromPage = 0; + } + +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/page/DataPage.java b/parquet-column/src/main/java/org/apache/parquet/column/page/DataPage.java index 4d8f381f51..fd1875eddf 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/page/DataPage.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/page/DataPage.java @@ -18,16 +18,24 @@ */ package org.apache.parquet.column.page; +import java.util.Optional; + /** * one data page in a chunk */ abstract public class DataPage extends Page { private final int valueCount; + private final long firstRowIndex; DataPage(int compressedSize, int uncompressedSize, int valueCount) { + this(compressedSize, uncompressedSize, valueCount, -1); + } + + DataPage(int compressedSize, int uncompressedSize, int valueCount, long firstRowIndex) { super(compressedSize, uncompressedSize); this.valueCount = valueCount; + this.firstRowIndex = firstRowIndex; } /** @@ -37,6 +45,20 @@ public int getValueCount() { return valueCount; } + /** + * @return the index of the first row in this page if the related data is available (the optional column-index + * contains this value) + */ + public Optional getFirstRowIndex() { + return firstRowIndex < 0 ? 
Optional.empty() : Optional.of(firstRowIndex); + } + + /** + * @return the number of rows in this page if the related data is available (in case of pageV1 the optional + * column-index contains this value) + */ + public abstract Optional getIndexRowCount(); + public abstract T accept(Visitor visitor); public static interface Visitor { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/page/DataPageV1.java b/parquet-column/src/main/java/org/apache/parquet/column/page/DataPageV1.java index 56928c3818..b1f68aefba 100755 --- a/parquet-column/src/main/java/org/apache/parquet/column/page/DataPageV1.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/page/DataPageV1.java @@ -18,6 +18,8 @@ */ package org.apache.parquet.column.page; +import java.util.Optional; + import org.apache.parquet.Ints; import org.apache.parquet.bytes.BytesInput; import org.apache.parquet.column.Encoding; @@ -30,6 +32,7 @@ public class DataPageV1 extends DataPage { private final Encoding rlEncoding; private final Encoding dlEncoding; private final Encoding valuesEncoding; + private final int indexRowCount; /** * @param bytes the bytes for this page @@ -47,6 +50,29 @@ public DataPageV1(BytesInput bytes, int valueCount, int uncompressedSize, Statis this.rlEncoding = rlEncoding; this.dlEncoding = dlEncoding; this.valuesEncoding = valuesEncoding; + this.indexRowCount = -1; + } + + /** + * @param bytes the bytes for this page + * @param valueCount count of values in this page + * @param uncompressedSize the uncompressed size of the page + * @param firstRowIndex the index of the first row in this page + * @param rowCount the number of rows in this page + * @param statistics of the page's values (max, min, num_null) + * @param rlEncoding the repetition level encoding for this page + * @param dlEncoding the definition level encoding for this page + * @param valuesEncoding the values encoding for this page + */ + public DataPageV1(BytesInput bytes, int valueCount, int 
uncompressedSize, long firstRowIndex, int rowCount, + Statistics statistics, Encoding rlEncoding, Encoding dlEncoding, Encoding valuesEncoding) { + super(Ints.checkedCast(bytes.size()), uncompressedSize, valueCount, firstRowIndex); + this.bytes = bytes; + this.statistics = statistics; + this.rlEncoding = rlEncoding; + this.dlEncoding = dlEncoding; + this.valuesEncoding = valuesEncoding; + this.indexRowCount = rowCount; } /** @@ -94,4 +120,9 @@ public String toString() { public T accept(Visitor visitor) { return visitor.visit(this); } + + @Override + public Optional getIndexRowCount() { + return indexRowCount < 0 ? Optional.empty() : Optional.of(indexRowCount); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/page/DataPageV2.java b/parquet-column/src/main/java/org/apache/parquet/column/page/DataPageV2.java index 62dac83713..a1700aea00 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/page/DataPageV2.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/page/DataPageV2.java @@ -18,6 +18,8 @@ */ package org.apache.parquet.column.page; +import java.util.Optional; + import org.apache.parquet.Ints; import org.apache.parquet.bytes.BytesInput; import org.apache.parquet.column.Encoding; @@ -50,6 +52,32 @@ public static DataPageV2 uncompressed( false); } + /** + * @param rowCount count of rows + * @param nullCount count of nulls + * @param valueCount count of values + * @param firstRowIndex the index of the first row in this page + * @param repetitionLevels RLE encoded repetition levels + * @param definitionLevels RLE encoded definition levels + * @param dataEncoding encoding for the data + * @param data data encoded with dataEncoding + * @param statistics optional statistics for this page + * @return an uncompressed page + */ + public static DataPageV2 uncompressed( + int rowCount, int nullCount, int valueCount, long firstRowIndex, + BytesInput repetitionLevels, BytesInput definitionLevels, + Encoding dataEncoding, 
BytesInput data, + Statistics statistics) { + return new DataPageV2( + rowCount, nullCount, valueCount, firstRowIndex, + repetitionLevels, definitionLevels, + dataEncoding, data, + Ints.checkedCast(repetitionLevels.size() + definitionLevels.size() + data.size()), + statistics, + false); + } + /** * @param rowCount count of rows * @param nullCount count of nulls @@ -104,6 +132,25 @@ public DataPageV2( this.isCompressed = isCompressed; } + private DataPageV2( + int rowCount, int nullCount, int valueCount, long firstRowIndex, + BytesInput repetitionLevels, BytesInput definitionLevels, + Encoding dataEncoding, BytesInput data, + int uncompressedSize, + Statistics statistics, + boolean isCompressed) { + super(Ints.checkedCast(repetitionLevels.size() + definitionLevels.size() + data.size()), uncompressedSize, + valueCount, firstRowIndex); + this.rowCount = rowCount; + this.nullCount = nullCount; + this.repetitionLevels = repetitionLevels; + this.definitionLevels = definitionLevels; + this.dataEncoding = dataEncoding; + this.data = data; + this.statistics = statistics; + this.isCompressed = isCompressed; + } + public int getRowCount() { return rowCount; } @@ -136,6 +183,11 @@ public boolean isCompressed() { return isCompressed; } + @Override + public Optional getIndexRowCount() { + return Optional.of(rowCount); + } + @Override public T accept(Visitor visitor) { return visitor.visit(this); diff --git a/parquet-column/src/main/java/org/apache/parquet/column/page/PageReadStore.java b/parquet-column/src/main/java/org/apache/parquet/column/page/PageReadStore.java index 24d5825543..753bda8907 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/page/PageReadStore.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/page/PageReadStore.java @@ -18,6 +18,8 @@ */ package org.apache.parquet.column.page; +import java.util.Optional; +import java.util.PrimitiveIterator; import org.apache.parquet.column.ColumnDescriptor; /** @@ -29,7 +31,8 @@ public interface 
PageReadStore { /** * - * @param descriptor the descriptor of the column + * @param descriptor + * the descriptor of the column * @return the page reader for that column */ PageReader getPageReader(ColumnDescriptor descriptor); @@ -40,4 +43,14 @@ public interface PageReadStore { */ long getRowCount(); + /** + * Returns the indexes of the rows to be read/built if the related data is available. All the rows which index is not + * returned shall be skipped. + * + * @return the optional of the incremental iterator of the row indexes or an empty optional if the related data is not + * available + */ + default Optional getRowIndexes() { + return Optional.empty(); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/page/PageWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/page/PageWriter.java index a2d079f9cf..a72be48b54 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/page/PageWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/page/PageWriter.java @@ -20,7 +20,6 @@ import java.io.IOException; -import org.apache.parquet.bytes.ByteBufferAllocator; import org.apache.parquet.bytes.BytesInput; import org.apache.parquet.column.Encoding; import org.apache.parquet.column.statistics.Statistics; @@ -39,9 +38,25 @@ public interface PageWriter { * @param dlEncoding definition level encoding * @param valuesEncoding values encoding * @throws IOException if there is an exception while writing page data + * @deprecated will be removed in 2.0.0. 
This method does not support writing column indexes; Use + * {@link #writePage(BytesInput, int, int, Statistics, Encoding, Encoding, Encoding)} instead */ + @Deprecated void writePage(BytesInput bytesInput, int valueCount, Statistics statistics, Encoding rlEncoding, Encoding dlEncoding, Encoding valuesEncoding) throws IOException; + /** + * writes a single page + * @param bytesInput the bytes for the page + * @param valueCount the number of values in that page + * @param rowCount the number of rows in that page + * @param statistics the statistics for that page + * @param rlEncoding repetition level encoding + * @param dlEncoding definition level encoding + * @param valuesEncoding values encoding + * @throws IOException + */ + void writePage(BytesInput bytesInput, int valueCount, int rowCount, Statistics statistics, Encoding rlEncoding, Encoding dlEncoding, Encoding valuesEncoding) throws IOException; + /** * writes a single page in the new format * @param rowCount the number of rows in this page diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesReader.java index 1154bc44ee..06771e9751 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/ValuesReader.java @@ -179,5 +179,17 @@ public long readLong() { * Skips the next value in the page */ abstract public void skip(); + + /** + * Skips the next n values in the page + * + * @param n + * the number of values to be skipped + */ + public void skip(int n) { + for (int i = 0; i < n; ++i) { + skip(); + } + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java index d2cf4d692c..f5ceadc428 100644 --- 
a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java @@ -16,14 +16,13 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.parquet.column.values.bloomfilter; +package org.apache.parquet.column.values.bloomfilter; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import org.apache.parquet.Preconditions; import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.io.api.Binary; - import java.io.IOException; import java.io.OutputStream; import java.nio.ByteBuffer; @@ -79,7 +78,6 @@ public class BlockSplitBloomFilter extends BloomFilter { // of bit to set, one bit in 32-bit word. private static final int SALT[] = {0x47b6137b, 0x44974d91, 0x8824ad5b, 0xa2b7289d, 0x705495c7, 0x2df1424b, 0x9efc4947, 0x5c6bfb31}; - /** * Constructor of Bloom filter. * @@ -103,7 +101,6 @@ public BlockSplitBloomFilter(int numBytes) { */ private BlockSplitBloomFilter(int numBytes, HashStrategy hashStrategy, Algorithm algorithm) { initBitset(numBytes); - switch (hashStrategy) { case MURMUR3_X64_128: this.hashStrategy = hashStrategy; @@ -112,11 +109,9 @@ private BlockSplitBloomFilter(int numBytes, HashStrategy hashStrategy, Algorithm default: throw new RuntimeException("Not supported hash strategy"); } - this.algorithm = algorithm; } - /** * Construct the Bloom filter with given bitset, it is used when reconstructing * Bloom filter from parquet file. 
It use murmur3_x64_128 as its default hash @@ -140,9 +135,9 @@ private BlockSplitBloomFilter(byte[] bitset, HashStrategy hashStrategy, Algorith if (bitset == null) { throw new RuntimeException("Given bitset is null"); } + this.bitset = bitset; this.intBuffer = ByteBuffer.wrap(bitset).order(ByteOrder.LITTLE_ENDIAN).asIntBuffer(); - switch (hashStrategy) { case MURMUR3_X64_128: this.hashStrategy = hashStrategy; @@ -167,16 +162,13 @@ private void initBitset(int numBytes) { if (numBytes < MINIMUM_BLOOM_FILTER_BYTES) { numBytes = MINIMUM_BLOOM_FILTER_BYTES; } - // Get next power of 2 if it is not power of 2. if ((numBytes & (numBytes - 1)) != 0) { numBytes = Integer.highestOneBit(numBytes) << 1; } - if (numBytes > MAXIMUM_BLOOM_FILTER_BYTES || numBytes < 0) { numBytes = MAXIMUM_BLOOM_FILTER_BYTES; } - this.bitset = new byte[numBytes]; this.intBuffer = ByteBuffer.wrap(bitset).order(ByteOrder.LITTLE_ENDIAN).asIntBuffer(); } @@ -185,13 +177,10 @@ private void initBitset(int numBytes) { public void writeTo(OutputStream out) throws IOException { // Write number of bytes of bitset. out.write(BytesUtils.intToBytes(bitset.length)); - // Write hash strategy out.write(BytesUtils.intToBytes(this.hashStrategy.ordinal())); - // Write algorithm out.write(BytesUtils.intToBytes(this.algorithm.ordinal())); - // Write bitset out.write(bitset); } @@ -202,11 +191,9 @@ private int[] setMask(int key) { for (int i = 0; i < BITS_SET_PER_BLOCK; ++i) { mask[i] = key * SALT[i]; } - for (int i = 0; i < BITS_SET_PER_BLOCK; ++i) { mask[i] = mask[i] >>> 27; } - for (int i = 0; i < BITS_SET_PER_BLOCK; ++i) { mask[i] = 0x1 << mask[i]; } @@ -221,7 +208,6 @@ public void insert(long hash) { // Calculate mask for bucket. int mask[] = setMask(key); - for (int i = 0; i < BITS_SET_PER_BLOCK; i++) { int value = intBuffer.get(bucketIndex * (BYTES_PER_FILTER_BLOCK / 4) + i); value |= mask[i]; @@ -236,7 +222,6 @@ public boolean find(long hash) { // Calculate mask for the tiny Bloom filter. 
int mask[] = setMask(key); - for (int i = 0; i < BITS_SET_PER_BLOCK; i++) { if (0 == (intBuffer.get(bucketIndex * (BYTES_PER_FILTER_BLOCK / 4) + i) & mask[i])) { return false; @@ -256,7 +241,6 @@ public boolean find(long hash) { public static int optimalNumOfBits(long n, double p) { Preconditions.checkArgument((p > 0.0 && p < 1.0), "FPP should be less than 1.0 and great than 0.0"); - final double m = -8 * n / Math.log(1 - Math.pow(p, 1.0 / 8)); final double MAX = MAXIMUM_BLOOM_FILTER_BYTES << 3; int numBits = (int)m; @@ -265,12 +249,10 @@ public static int optimalNumOfBits(long n, double p) { if (m > MAX || m < 0) { numBits = (int)MAX; } - // Get next power of 2 if bits is not power of 2. if ((numBits & (numBits - 1)) != 0) { numBits = Integer.highestOneBit(numBits) << 1; } - if (numBits < (MINIMUM_BLOOM_FILTER_BYTES << 3)) { numBits = MINIMUM_BLOOM_FILTER_BYTES << 3; } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java index 430fab8d61..4199497fd9 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java @@ -19,7 +19,6 @@ package org.apache.parquet.column.values.bloomfilter; import org.apache.parquet.io.api.Binary; - import java.io.IOException; import java.io.OutputStream; @@ -94,6 +93,7 @@ public enum Algorithm { * @return hash result */ public abstract long hash(float value); + /** * Compute hash for Binary value by using its plain encoding result. 
* diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterReadStore.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterReadStore.java index bdc51755b0..3373bc1a0e 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterReadStore.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterReadStore.java @@ -23,7 +23,6 @@ /** * contains all the bloom filter reader for all columns of a row group */ - public interface BloomFilterReadStore { /** * Get a Bloom filter reader of a column @@ -33,5 +32,3 @@ public interface BloomFilterReadStore { */ BloomFilterReader getBloomFilterReader(ColumnDescriptor path); } - - diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterReader.java index 39b25e2a49..7a430581dd 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterReader.java @@ -16,6 +16,7 @@ * specific language governing permissions and limitations * under the License. */ + package org.apache.parquet.column.values.bloomfilter; import org.apache.parquet.column.ColumnDescriptor; diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriteStore.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriteStore.java index f472104daa..f7e28fdf2d 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriteStore.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriteStore.java @@ -16,6 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ + package org.apache.parquet.column.values.bloomfilter; import org.apache.parquet.column.ColumnDescriptor; diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriter.java index 388e779968..0fab73b2a4 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriter.java @@ -1,3 +1,5 @@ + + /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file @@ -16,14 +18,16 @@ * specific language governing permissions and limitations * under the License. */ + package org.apache.parquet.column.values.bloomfilter; public interface BloomFilterWriter { /** - * Write a bloom filter + * Write a Bloom filter * - * @param bloomFilter the bloom filter to write + * @param bloomFilter the Bloom filter to write * */ void writeBloomFilter(BloomFilter bloomFilter); } + diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/delta/DeltaBinaryPackingValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/delta/DeltaBinaryPackingValuesReader.java index c8a80fd308..80cfaf2b04 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/delta/DeltaBinaryPackingValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/delta/DeltaBinaryPackingValuesReader.java @@ -90,6 +90,14 @@ public void skip() { valuesRead++; } + @Override + public void skip(int n) { + // checkRead() is invoked before incrementing valuesRead so increase valuesRead size in 2 steps + valuesRead += n - 1; + checkRead(); + ++valuesRead; + } + @Override public int readInteger() { // TODO: probably implement it separately diff --git 
a/parquet-column/src/main/java/org/apache/parquet/column/values/deltalengthbytearray/DeltaLengthByteArrayValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/deltalengthbytearray/DeltaLengthByteArrayValuesReader.java index 1a2ccb9b53..4dbbcb5645 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/deltalengthbytearray/DeltaLengthByteArrayValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/deltalengthbytearray/DeltaLengthByteArrayValuesReader.java @@ -20,8 +20,6 @@ import java.io.IOException; -import java.nio.ByteBuffer; - import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.column.values.ValuesReader; import org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesReader; @@ -64,7 +62,15 @@ public Binary readBytes() { @Override public void skip() { - int length = lengthReader.readInteger(); + skip(1); + } + + @Override + public void skip(int n) { + int length = 0; + for (int i = 0; i < n; ++i) { + length += lengthReader.readInteger(); + } try { in.skipFully(length); } catch (IOException e) { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesReader.java index 15ed43438f..631c9084d1 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesReader.java @@ -19,7 +19,6 @@ package org.apache.parquet.column.values.plain; import java.io.IOException; -import java.nio.ByteBuffer; import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.column.values.ValuesReader; import org.apache.parquet.io.ParquetDecodingException; @@ -51,8 +50,13 @@ public Binary readBytes() { @Override public void skip() { + skip(1); + 
} + + @Override + public void skip(int n) { try { - in.skipFully(length); + in.skipFully(n * length); } catch (IOException | RuntimeException e) { throw new ParquetDecodingException("could not skip bytes at offset " + in.position(), e); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/PlainValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/PlainValuesReader.java index f576528a98..127817eb0c 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/PlainValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/PlainValuesReader.java @@ -41,14 +41,26 @@ public void initFromPage(int valueCount, ByteBufferInputStream stream) throws IO this.in = new LittleEndianDataInputStream(stream.remainingStream()); } + @Override + public void skip() { + skip(1); + } + + void skipBytesFully(int n) throws IOException { + int skipped = 0; + while (skipped < n) { + skipped += in.skipBytes(n - skipped); + } + } + public static class DoublePlainValuesReader extends PlainValuesReader { @Override - public void skip() { + public void skip(int n) { try { - in.skipBytes(8); + skipBytesFully(n * 8); } catch (IOException e) { - throw new ParquetDecodingException("could not skip double", e); + throw new ParquetDecodingException("could not skip " + n + " double values", e); } } @@ -65,11 +77,11 @@ public double readDouble() { public static class FloatPlainValuesReader extends PlainValuesReader { @Override - public void skip() { + public void skip(int n) { try { - in.skipBytes(4); + skipBytesFully(n * 4); } catch (IOException e) { - throw new ParquetDecodingException("could not skip float", e); + throw new ParquetDecodingException("could not skip " + n + " floats", e); } } @@ -86,11 +98,11 @@ public float readFloat() { public static class IntegerPlainValuesReader extends PlainValuesReader { @Override - public void skip() { + public void skip(int n) { try { - 
in.skipBytes(4); + skipBytesFully(n * 4); /* a single skipBytes call may skip fewer bytes than requested; loop until all n ints are skipped */ } catch (IOException e) { - throw new ParquetDecodingException("could not skip int", e); + throw new ParquetDecodingException("could not skip " + n + " ints", e); } } @@ -107,11 +119,11 @@ public int readInteger() { public static class LongPlainValuesReader extends PlainValuesReader { @Override - public void skip() { + public void skip(int n) { try { - in.skipBytes(8); + skipBytesFully(n * 8); /* same as the int/float/double readers: never trust a single skipBytes call */ } catch (IOException e) { - throw new ParquetDecodingException("could not skip long", e); + throw new ParquetDecodingException("could not skip " + n + " longs", e); } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesWriter.java index 3b7a5def47..a51a8c4d82 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridValuesWriter.java @@ -19,6 +19,7 @@ package org.apache.parquet.column.values.rle; import java.io.IOException; +import java.util.Objects; import org.apache.parquet.bytes.ByteBufferAllocator; import org.apache.parquet.Ints; @@ -28,10 +29,14 @@ import org.apache.parquet.io.ParquetEncodingException; public class RunLengthBitPackingHybridValuesWriter extends ValuesWriter { - private final RunLengthBitPackingHybridEncoder encoder; + protected final RunLengthBitPackingHybridEncoder encoder; public RunLengthBitPackingHybridValuesWriter(int bitWidth, int initialCapacity, int pageSize, ByteBufferAllocator allocator) { - this.encoder = new RunLengthBitPackingHybridEncoder(bitWidth, initialCapacity, pageSize, allocator); + this(new RunLengthBitPackingHybridEncoder(bitWidth, initialCapacity, pageSize, allocator)); + } + + protected RunLengthBitPackingHybridValuesWriter(RunLengthBitPackingHybridEncoder encoder) { +
this.encoder = Objects.requireNonNull(encoder); } @Override diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/ZeroIntegerValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/ZeroIntegerValuesReader.java index beeb0ad2ed..09ca8a1a47 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/rle/ZeroIntegerValuesReader.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/rle/ZeroIntegerValuesReader.java @@ -42,4 +42,8 @@ public void initFromPage(int valueCount, ByteBufferInputStream stream) throws IO public void skip() { } + @Override + public void skip(int n) { + } + } diff --git a/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/ValidTypeMap.java b/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/ValidTypeMap.java index b8f48bb0b8..62c174e547 100644 --- a/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/ValidTypeMap.java +++ b/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/ValidTypeMap.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -25,7 +25,6 @@ import org.apache.parquet.filter2.predicate.Operators.Column; import org.apache.parquet.hadoop.metadata.ColumnPath; -import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; /** diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java new file mode 100644 index 0000000000..490cc3e9b3 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.internal.column.columnindex; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; + +import org.apache.parquet.filter2.predicate.Statistics; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveComparator; +import org.apache.parquet.schema.PrimitiveType; + +class BinaryColumnIndexBuilder extends ColumnIndexBuilder { + private static class BinaryColumnIndex extends ColumnIndexBase { + private Binary[] minValues; + private Binary[] maxValues; + + private BinaryColumnIndex(PrimitiveType type) { + super(type); + } + + @Override + ByteBuffer getMinValueAsBytes(int pageIndex) { + return convert(minValues[pageIndex]); + } + + @Override + ByteBuffer getMaxValueAsBytes(int pageIndex) { + return convert(maxValues[pageIndex]); + } + + @Override + String getMinValueAsString(int pageIndex) { + return stringifier.stringify(minValues[pageIndex]); + } + + @Override + String getMaxValueAsString(int pageIndex) { + return stringifier.stringify(maxValues[pageIndex]); + } + + @Override + @SuppressWarnings("unchecked") + > Statistics createStats(int arrayIndex) { + return (Statistics) new Statistics(minValues[arrayIndex], maxValues[arrayIndex], comparator); + } + + @Override + ValueComparator createValueComparator(Object value) { + final Binary v = (Binary) value; + return new ValueComparator() { + @Override + int compareValueToMin(int arrayIndex) { + return comparator.compare(v, minValues[arrayIndex]); + } + + @Override + int compareValueToMax(int arrayIndex) { + return comparator.compare(v, maxValues[arrayIndex]); + } + }; + } + } + + private final List minValues = new ArrayList<>(); + private final List maxValues = new ArrayList<>(); + private final BinaryTruncator truncator; + private final int truncateLength; + + private static Binary convert(ByteBuffer buffer) { + return Binary.fromReusedByteBuffer(buffer); + } + + private static ByteBuffer convert(Binary value) { + return 
value.toByteBuffer(); + } + + BinaryColumnIndexBuilder(PrimitiveType type, int truncateLength) { + truncator = BinaryTruncator.getTruncator(type); + this.truncateLength = truncateLength; + } + + @Override + void addMinMaxFromBytes(ByteBuffer min, ByteBuffer max) { + minValues.add(convert(min)); + maxValues.add(convert(max)); + } + + @Override + void addMinMax(Object min, Object max) { + minValues.add(min == null ? null : truncator.truncateMin((Binary) min, truncateLength)); + maxValues.add(max == null ? null : truncator.truncateMax((Binary) max, truncateLength)); + } + + @Override + ColumnIndexBase createColumnIndex(PrimitiveType type) { + BinaryColumnIndex columnIndex = new BinaryColumnIndex(type); + columnIndex.minValues = minValues.toArray(new Binary[minValues.size()]); + columnIndex.maxValues = maxValues.toArray(new Binary[maxValues.size()]); + return columnIndex; + } + + @Override + void clearMinMax() { + minValues.clear(); + maxValues.clear(); + } + + @Override + int compareMinValues(PrimitiveComparator comparator, int index1, int index2) { + return comparator.compare(minValues.get(index1), minValues.get(index2)); + } + + @Override + int compareMaxValues(PrimitiveComparator comparator, int index1, int index2) { + return comparator.compare(maxValues.get(index1), maxValues.get(index2)); + } + + @Override + int sizeOf(Object value) { + return ((Binary) value).length(); + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryTruncator.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryTruncator.java new file mode 100644 index 0000000000..bcc43fb866 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryTruncator.java @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.internal.column.columnindex; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CoderResult; +import java.nio.charset.CodingErrorAction; +import java.nio.charset.StandardCharsets; + +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; + +/** + * Class for truncating min/max values for binary types. 
+ */ +abstract class BinaryTruncator { + enum Validity { + VALID, MALFORMED, UNMAPPABLE; + } + + private static class CharsetValidator { + private final CharBuffer dummyBuffer = CharBuffer.allocate(1024); + private final CharsetDecoder decoder; + + CharsetValidator(Charset charset) { + decoder = charset.newDecoder(); + decoder.onMalformedInput(CodingErrorAction.REPORT); + decoder.onUnmappableCharacter(CodingErrorAction.REPORT); + } + + Validity checkValidity(ByteBuffer buffer) { + int pos = buffer.position(); + CoderResult result = CoderResult.OVERFLOW; + while (result.isOverflow()) { + dummyBuffer.clear(); + result = decoder.decode(buffer, dummyBuffer, true); + } + buffer.position(pos); + if (result.isUnderflow()) { + return Validity.VALID; + } else if (result.isMalformed()) { + return Validity.MALFORMED; + } else { + return Validity.UNMAPPABLE; + } + } + } + + private static final BinaryTruncator NO_OP_TRUNCATOR = new BinaryTruncator() { + @Override + Binary truncateMin(Binary minValue, int length) { + return minValue; + } + + @Override + Binary truncateMax(Binary maxValue, int length) { + return maxValue; + } + }; + + private static final BinaryTruncator DEFAULT_UTF8_TRUNCATOR = new BinaryTruncator() { + private final CharsetValidator validator = new CharsetValidator(StandardCharsets.UTF_8); + + @Override + Binary truncateMin(Binary minValue, int length) { + if (minValue.length() <= length) { + return minValue; + } + ByteBuffer buffer = minValue.toByteBuffer(); + byte[] array; + if (validator.checkValidity(buffer) == Validity.VALID) { + array = truncateUtf8(buffer, length); + } else { + array = truncate(buffer, length); + } + return array == null ? 
minValue : Binary.fromConstantByteArray(array); + } + + @Override + Binary truncateMax(Binary maxValue, int length) { + if (maxValue.length() <= length) { + return maxValue; + } + byte[] array; + ByteBuffer buffer = maxValue.toByteBuffer(); + if (validator.checkValidity(buffer) == Validity.VALID) { + array = incrementUtf8(truncateUtf8(buffer, length)); + } else { + array = increment(truncate(buffer, length)); + } + return array == null ? maxValue : Binary.fromConstantByteArray(array); + } + + // Simply truncate to length + private byte[] truncate(ByteBuffer buffer, int length) { + assert length < buffer.remaining(); + byte[] array = new byte[length]; + buffer.get(array); + return array; + } + + // Trying to increment the bytes from the last one to the beginning + private byte[] increment(byte[] array) { + for (int i = array.length - 1; i >= 0; --i) { + byte elem = array[i]; + ++elem; + array[i] = elem; + if (elem != 0) { // Did not overflow: 0xFF -> 0x00 + return array; + } + } + return null; + } + + // Truncates the buffer to length or less so the remaining bytes form a valid UTF-8 string + private byte[] truncateUtf8(ByteBuffer buffer, int length) { + assert length < buffer.remaining(); + ByteBuffer newBuffer = buffer.slice(); + newBuffer.limit(newBuffer.position() + length); + while (validator.checkValidity(newBuffer) != Validity.VALID) { + newBuffer.limit(newBuffer.limit() - 1); + if (newBuffer.remaining() == 0) { + return null; + } + } + byte[] array = new byte[newBuffer.remaining()]; + newBuffer.get(array); + return array; + } + + // Trying to increment the bytes from the last one to the beginning until the bytes form a valid UTF-8 string + private byte[] incrementUtf8(byte[] array) { + if (array == null) { + return null; + } + ByteBuffer buffer = ByteBuffer.wrap(array); + for (int i = array.length - 1; i >= 0; --i) { + byte prev = array[i]; + byte inc = prev; + while (++inc != 0) { // Until overflow: 0xFF -> 0x00 + array[i] = inc; + switch 
(validator.checkValidity(buffer)) { + case VALID: + return array; + case UNMAPPABLE: + continue; // Increment the i byte once more + case MALFORMED: + break; // Stop incrementing the i byte; go to the i-1 + } + break; // MALFORMED + } + array[i] = prev; + } + return null; // All characters are the largest possible; unable to increment + } + }; + + static BinaryTruncator getTruncator(PrimitiveType type) { + if (type == null) { + return NO_OP_TRUNCATOR; + } + switch (type.getPrimitiveTypeName()) { + case INT96: + return NO_OP_TRUNCATOR; + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + OriginalType originalType = type.getOriginalType(); + if (originalType == null) { + return DEFAULT_UTF8_TRUNCATOR; + } + switch (originalType) { + case UTF8: + case ENUM: + case JSON: + case BSON: + return DEFAULT_UTF8_TRUNCATOR; + default: + return NO_OP_TRUNCATOR; + } + default: + throw new IllegalArgumentException("No truncator is available for the type: " + type); + } + } + + abstract Binary truncateMin(Binary minValue, int length); + + abstract Binary truncateMax(Binary maxValue, int length); +} diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BooleanColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BooleanColumnIndexBuilder.java new file mode 100644 index 0000000000..233bd1b026 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BooleanColumnIndexBuilder.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.internal.column.columnindex; + +import java.nio.ByteBuffer; +import org.apache.parquet.filter2.predicate.Statistics; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveComparator; +import org.apache.parquet.schema.PrimitiveType; + +import it.unimi.dsi.fastutil.booleans.BooleanArrayList; +import it.unimi.dsi.fastutil.booleans.BooleanList; + +class BooleanColumnIndexBuilder extends ColumnIndexBuilder { + private static class BooleanColumnIndex extends ColumnIndexBase { + private boolean[] minValues; + private boolean[] maxValues; + + private BooleanColumnIndex(PrimitiveType type) { + super(type); + } + + @Override + ByteBuffer getMinValueAsBytes(int pageIndex) { + return convert(minValues[pageIndex]); + } + + @Override + ByteBuffer getMaxValueAsBytes(int pageIndex) { + return convert(maxValues[pageIndex]); + } + + @Override + String getMinValueAsString(int pageIndex) { + return stringifier.stringify(minValues[pageIndex]); + } + + @Override + String getMaxValueAsString(int pageIndex) { + return stringifier.stringify(maxValues[pageIndex]); + } + + @Override + @SuppressWarnings("unchecked") + > Statistics createStats(int arrayIndex) { + return (Statistics) new Statistics(minValues[arrayIndex], maxValues[arrayIndex], comparator); + } + + @Override + ValueComparator createValueComparator(Object value) { + final boolean v = (boolean) value; + return new ValueComparator() { + @Override + int compareValueToMin(int arrayIndex) { + return comparator.compare(v, minValues[arrayIndex]); + } + + 
@Override + int compareValueToMax(int arrayIndex) { + return comparator.compare(v, maxValues[arrayIndex]); + } + }; + } + } + + private final BooleanList minValues = new BooleanArrayList(); + private final BooleanList maxValues = new BooleanArrayList(); + + private static boolean convert(ByteBuffer buffer) { + return buffer.get(0) != 0; + } + + private static ByteBuffer convert(boolean value) { + return ByteBuffer.allocate(1).put(0, value ? (byte) 1 : 0); + } + + @Override + void addMinMaxFromBytes(ByteBuffer min, ByteBuffer max) { + minValues.add(convert(min)); + maxValues.add(convert(max)); + } + + @Override + void addMinMax(Object min, Object max) { + minValues.add((boolean) min); + maxValues.add((boolean) max); + } + + @Override + ColumnIndexBase createColumnIndex(PrimitiveType type) { + BooleanColumnIndex columnIndex = new BooleanColumnIndex(type); + columnIndex.minValues = minValues.toBooleanArray(); + columnIndex.maxValues = maxValues.toBooleanArray(); + return columnIndex; + } + + @Override + void clearMinMax() { + minValues.clear(); + maxValues.clear(); + } + + @Override + int compareMinValues(PrimitiveComparator comparator, int index1, int index2) { + return comparator.compare(minValues.get(index1), minValues.get(index2)); + } + + @Override + int compareMaxValues(PrimitiveComparator comparator, int index1, int index2) { + return comparator.compare(maxValues.get(index1), maxValues.get(index2)); + } + + @Override + int sizeOf(Object value) { + return 1; + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BoundaryOrder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BoundaryOrder.java new file mode 100644 index 0000000000..e47b5b3f1a --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BoundaryOrder.java @@ -0,0 +1,352 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.internal.column.columnindex; + +import java.util.PrimitiveIterator; +import java.util.PrimitiveIterator.OfInt; + +import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder.ColumnIndexBase; + +/** + * Enum for {@link org.apache.parquet.format.BoundaryOrder}. It also contains the implementations of searching for + * matching page indexes for column index based filtering. 
+ */ +public enum BoundaryOrder { + UNORDERED { + @Override + PrimitiveIterator.OfInt eq(ColumnIndexBase.ValueComparator comparator) { + return IndexIterator.filterTranslate(comparator.arrayLength(), + arrayIndex -> comparator.compareValueToMin(arrayIndex) >= 0 && comparator.compareValueToMax(arrayIndex) <= 0, + comparator::translate); + } + + @Override + PrimitiveIterator.OfInt gt(ColumnIndexBase.ValueComparator comparator) { + return IndexIterator.filterTranslate(comparator.arrayLength(), + arrayIndex -> comparator.compareValueToMax(arrayIndex) < 0, + comparator::translate); + } + + @Override + PrimitiveIterator.OfInt gtEq(ColumnIndexBase.ValueComparator comparator) { + return IndexIterator.filterTranslate(comparator.arrayLength(), + arrayIndex -> comparator.compareValueToMax(arrayIndex) <= 0, + comparator::translate); + } + + @Override + PrimitiveIterator.OfInt lt(ColumnIndexBase.ValueComparator comparator) { + return IndexIterator.filterTranslate(comparator.arrayLength(), + arrayIndex -> comparator.compareValueToMin(arrayIndex) > 0, + comparator::translate); + } + + @Override + PrimitiveIterator.OfInt ltEq(ColumnIndexBase.ValueComparator comparator) { + return IndexIterator.filterTranslate(comparator.arrayLength(), + arrayIndex -> comparator.compareValueToMin(arrayIndex) >= 0, + comparator::translate); + } + + @Override + PrimitiveIterator.OfInt notEq(ColumnIndexBase.ValueComparator comparator) { + return IndexIterator.filterTranslate(comparator.arrayLength(), + arrayIndex -> comparator.compareValueToMin(arrayIndex) != 0 || comparator.compareValueToMax(arrayIndex) != 0, + comparator::translate); + } + }, + ASCENDING { + @Override + OfInt eq(ColumnIndexBase.ValueComparator comparator) { + Bounds bounds = findBounds(comparator); + if (bounds == null) { + return IndexIterator.EMPTY; + } + return IndexIterator.rangeTranslate(bounds.lower, bounds.upper, comparator::translate); + } + + @Override + OfInt gt(ColumnIndexBase.ValueComparator comparator) { + int length = 
comparator.arrayLength(); + int left = 0; + int right = length; + do { + int i = floorMid(left, right); + if (comparator.compareValueToMax(i) >= 0) { + left = i + 1; + } else { + right = i; + } + } while (left < right); + return IndexIterator.rangeTranslate(right, length - 1, comparator::translate); + } + + @Override + OfInt gtEq(ColumnIndexBase.ValueComparator comparator) { + int length = comparator.arrayLength(); + int left = 0; + int right = length; + do { + int i = floorMid(left, right); + if (comparator.compareValueToMax(i) > 0) { + left = i + 1; + } else { + right = i; + } + } while (left < right); + return IndexIterator.rangeTranslate(right, length - 1, comparator::translate); + } + + @Override + OfInt lt(ColumnIndexBase.ValueComparator comparator) { + int length = comparator.arrayLength(); + int left = -1; + int right = length - 1; + do { + int i = ceilingMid(left, right); + if (comparator.compareValueToMin(i) <= 0) { + right = i - 1; + } else { + left = i; + } + } while (left < right); + return IndexIterator.rangeTranslate(0, left, comparator::translate); + } + + @Override + OfInt ltEq(ColumnIndexBase.ValueComparator comparator) { + int length = comparator.arrayLength(); + int left = -1; + int right = length - 1; + do { + int i = ceilingMid(left, right); + if (comparator.compareValueToMin(i) < 0) { + right = i - 1; + } else { + left = i; + } + } while (left < right); + return IndexIterator.rangeTranslate(0, left, comparator::translate); + } + + @Override + OfInt notEq(ColumnIndexBase.ValueComparator comparator) { + Bounds bounds = findBounds(comparator); + int length = comparator.arrayLength(); + if (bounds == null) { + return IndexIterator.all(comparator); + } + return IndexIterator.filterTranslate( + length, + i -> i < bounds.lower || i > bounds.upper || comparator.compareValueToMin(i) != 0 + || comparator.compareValueToMax(i) != 0, + comparator::translate); + } + + private Bounds findBounds(ColumnIndexBase.ValueComparator comparator) { + int length = 
comparator.arrayLength(); + int lowerLeft = 0; + int upperLeft = 0; + int lowerRight = length - 1; + int upperRight = length - 1; + do { + if (lowerLeft > lowerRight) { + return null; + } + int i = floorMid(lowerLeft, lowerRight); + if (comparator.compareValueToMin(i) < 0) { + lowerRight = upperRight = i - 1; + } else if (comparator.compareValueToMax(i) > 0) { + lowerLeft = upperLeft = i + 1; + } else { + lowerRight = upperLeft = i; + } + } while (lowerLeft != lowerRight); + do { + if (upperLeft > upperRight) { + return null; + } + int i = ceilingMid(upperLeft, upperRight); + if (comparator.compareValueToMin(i) < 0) { + upperRight = i - 1; + } else if (comparator.compareValueToMax(i) > 0) { + upperLeft = i + 1; + } else { + upperLeft = i; + } + } while (upperLeft != upperRight); + return new Bounds(lowerLeft, upperRight); + } + }, + DESCENDING { + @Override + OfInt eq(ColumnIndexBase.ValueComparator comparator) { + Bounds bounds = findBounds(comparator); + if (bounds == null) { + return IndexIterator.EMPTY; + } + return IndexIterator.rangeTranslate(bounds.lower, bounds.upper, comparator::translate); + } + + @Override + OfInt gt(ColumnIndexBase.ValueComparator comparator) { + int length = comparator.arrayLength(); + int left = -1; + int right = length - 1; + do { + int i = ceilingMid(left, right); + if (comparator.compareValueToMax(i) >= 0) { + right = i - 1; + } else { + left = i; + } + } while (left < right); + return IndexIterator.rangeTranslate(0, left, comparator::translate); + } + + @Override + OfInt gtEq(ColumnIndexBase.ValueComparator comparator) { + int length = comparator.arrayLength(); + int left = -1; + int right = length - 1; + do { + int i = ceilingMid(left, right); + if (comparator.compareValueToMax(i) > 0) { + right = i - 1; + } else { + left = i; + } + } while (left < right); + return IndexIterator.rangeTranslate(0, left, comparator::translate); + } + + @Override + OfInt lt(ColumnIndexBase.ValueComparator comparator) { + int length = 
comparator.arrayLength(); + int left = 0; + int right = length; + do { + int i = floorMid(left, right); + if (comparator.compareValueToMin(i) <= 0) { + left = i + 1; + } else { + right = i; + } + } while (left < right); + return IndexIterator.rangeTranslate(right, length - 1, comparator::translate); + } + + @Override + OfInt ltEq(ColumnIndexBase.ValueComparator comparator) { + int length = comparator.arrayLength(); + int left = 0; + int right = length; + do { + int i = floorMid(left, right); + if (comparator.compareValueToMin(i) < 0) { + left = i + 1; + } else { + right = i; + } + } while (left < right); + return IndexIterator.rangeTranslate(right, length - 1, comparator::translate); + } + + @Override + OfInt notEq(ColumnIndexBase.ValueComparator comparator) { + Bounds bounds = findBounds(comparator); + int length = comparator.arrayLength(); + if (bounds == null) { + return IndexIterator.all(comparator); + } + return IndexIterator.filterTranslate( + length, + i -> i < bounds.lower || i > bounds.upper || comparator.compareValueToMin(i) != 0 + || comparator.compareValueToMax(i) != 0, + comparator::translate); + } + + private Bounds findBounds(ColumnIndexBase.ValueComparator comparator) { + int length = comparator.arrayLength(); + int lowerLeft = 0; + int upperLeft = 0; + int lowerRight = length - 1; + int upperRight = length - 1; + do { + if (lowerLeft > lowerRight) { + return null; + } + int i = floorMid(lowerLeft, lowerRight); + if (comparator.compareValueToMax(i) > 0) { + lowerRight = upperRight = i - 1; + } else if (comparator.compareValueToMin(i) < 0) { + lowerLeft = upperLeft = i + 1; + } else { + lowerRight = upperLeft = i; + } + } while (lowerLeft != lowerRight); + do { + if (upperLeft > upperRight) { + return null; + } + int i = ceilingMid(upperLeft, upperRight); + if (comparator.compareValueToMax(i) > 0) { + upperRight = i - 1; + } else if (comparator.compareValueToMin(i) < 0) { + upperLeft = i + 1; + } else { + upperLeft = i; + } + } while (upperLeft != 
upperRight); + return new Bounds(lowerLeft, upperRight); + } + }; + + private static class Bounds { + final int lower, upper; + + Bounds(int lower, int upper) { + assert lower <= upper; + this.lower = lower; + this.upper = upper; + } + } + + private static int floorMid(int left, int right) { + // Avoid the possible overflow that might happen in case of (left + right) / 2 + return left + ((right - left) / 2); + } + + private static int ceilingMid(int left, int right) { + // Avoid the possible overflow that might happen in case of (left + right + 1) / 2 + return left + ((right - left + 1) / 2); + } + + abstract PrimitiveIterator.OfInt eq(ColumnIndexBase.ValueComparator comparator); + + abstract PrimitiveIterator.OfInt gt(ColumnIndexBase.ValueComparator comparator); + + abstract PrimitiveIterator.OfInt gtEq(ColumnIndexBase.ValueComparator comparator); + + abstract PrimitiveIterator.OfInt lt(ColumnIndexBase.ValueComparator comparator); + + abstract PrimitiveIterator.OfInt ltEq(ColumnIndexBase.ValueComparator comparator); + + abstract PrimitiveIterator.OfInt notEq(ColumnIndexBase.ValueComparator comparator); +} diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndex.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndex.java new file mode 100644 index 0000000000..b91a5c0d96 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndex.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.internal.column.columnindex; + +import java.nio.ByteBuffer; +import java.util.List; +import java.util.PrimitiveIterator; + +import org.apache.parquet.filter2.predicate.FilterPredicate.Visitor; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter; + +/** + * Column index containing min/max and null count values for the pages in a column chunk. It also implements methods of + * {@link Visitor} to return the indexes of the matching pages. They are used by {@link ColumnIndexFilter}. + * + * @see org.apache.parquet.format.ColumnIndex + */ +public interface ColumnIndex extends Visitor { + /** + * @return the boundary order of the min/max values; used for converting to the related thrift object + */ + public BoundaryOrder getBoundaryOrder(); + + /** + * @return the unmodifiable list of null counts; used for converting to the related thrift object + */ + public List getNullCounts(); + + /** + * @return the unmodifiable list of null pages; used for converting to the related thrift object + */ + public List getNullPages(); + + /** + * @return the list of the min values as {@link ByteBuffer}s; used for converting to the related thrift object + */ + public List getMinValues(); + + /** + * @return the list of the max values as {@link ByteBuffer}s; used for converting to the related thrift object + */ + public List getMaxValues(); + +} diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java 
b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
new file mode 100644
index 0000000000..b28fddee42
--- /dev/null
+++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
@@ -0,0 +1,636 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.internal.column.columnindex;
+
+import static java.util.Objects.requireNonNull;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.EnumMap;
+import java.util.Formatter;
+import java.util.List;
+import java.util.Map;
+import java.util.PrimitiveIterator;
+import java.util.function.IntPredicate;
+
+import org.apache.parquet.column.statistics.Statistics;
+import org.apache.parquet.filter2.predicate.UserDefinedPredicate;
+import org.apache.parquet.filter2.predicate.Operators.And;
+import org.apache.parquet.filter2.predicate.Operators.Eq;
+import org.apache.parquet.filter2.predicate.Operators.Gt;
+import org.apache.parquet.filter2.predicate.Operators.GtEq;
+import org.apache.parquet.filter2.predicate.Operators.LogicalNotUserDefined;
+import org.apache.parquet.filter2.predicate.Operators.Lt;
+import org.apache.parquet.filter2.predicate.Operators.LtEq;
+import org.apache.parquet.filter2.predicate.Operators.Not;
+import org.apache.parquet.filter2.predicate.Operators.NotEq;
+import org.apache.parquet.filter2.predicate.Operators.Or;
+import org.apache.parquet.filter2.predicate.Operators.UserDefined;
+import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.schema.PrimitiveComparator;
+import org.apache.parquet.schema.PrimitiveStringifier;
+import org.apache.parquet.schema.PrimitiveType;
+import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
+
+import it.unimi.dsi.fastutil.booleans.BooleanArrayList;
+import it.unimi.dsi.fastutil.booleans.BooleanList;
+import it.unimi.dsi.fastutil.booleans.BooleanLists;
+import it.unimi.dsi.fastutil.ints.IntArrayList;
+import it.unimi.dsi.fastutil.ints.IntList;
+import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
+import it.unimi.dsi.fastutil.ints.IntSet;
+import it.unimi.dsi.fastutil.longs.LongArrayList;
+import it.unimi.dsi.fastutil.longs.LongList;
+import it.unimi.dsi.fastutil.longs.LongLists;
+
+/**
+ * Builder implementation to create {@link ColumnIndex} objects.
+ * <p>
+ * A builder collects one entry per page (null-page flag, null count and, for pages having non-null values, the
+ * min/max values) and creates the type specific {@link ColumnIndex} implementation from them.
+ */
+public abstract class ColumnIndexBuilder {
+
+  /**
+   * Common base of the type specific column index implementations. It keeps the null-page flags, the optional null
+   * counts, the boundary order and the array index to page index mapping; the min/max values are stored by the
+   * subclasses.
+   */
+  static abstract class ColumnIndexBase implements ColumnIndex {
+    /*
+     * A class containing the value to be compared to the min/max values. This way we only need to do the unboxing
+     * once per predicate execution instead of for every comparison.
+     */
+    abstract class ValueComparator {
+      abstract int compareValueToMin(int arrayIndex);
+
+      abstract int compareValueToMax(int arrayIndex);
+
+      /* The number of min/max entries, i.e. the number of pages that are not null-pages. */
+      int arrayLength() {
+        return pageIndexes.length;
+      }
+
+      /* Translates the index of a min/max entry to the index of the page it belongs to. */
+      int translate(int arrayIndex) {
+        return pageIndexes[arrayIndex];
+      }
+    }
+
+    private static final ByteBuffer EMPTY_BYTE_BUFFER = ByteBuffer.allocate(0);
+    private static final int MAX_VALUE_LENGTH_FOR_TOSTRING = 40;
+    private static final String TOSTRING_TRUNCATION_MARKER = "(...)";
+    private static final int TOSTRING_TRUNCATION_START_POS = (MAX_VALUE_LENGTH_FOR_TOSTRING
+        - TOSTRING_TRUNCATION_MARKER.length()) / 2;
+    private static final int TOSTRING_TRUNCATION_END_POS = MAX_VALUE_LENGTH_FOR_TOSTRING
+        - TOSTRING_TRUNCATION_MARKER.length() - TOSTRING_TRUNCATION_START_POS;
+    private static final String TOSTRING_MISSING_VALUE_MARKER = "";
+
+    final PrimitiveStringifier stringifier;
+    final PrimitiveComparator comparator;
+    private boolean[] nullPages;
+    private BoundaryOrder boundaryOrder;
+    // Storing the page index for each array index (min/max values are not stored for null-pages)
+    private int[] pageIndexes;
+    // might be null
+    private long[] nullCounts;
+
+    /*
+     * Shortens the specified string to MAX_VALUE_LENGTH_FOR_TOSTRING characters by replacing its middle part with
+     * the truncation marker. Shorter strings are returned unchanged.
+     */
+    static String truncate(String str) {
+      if (str.length() <= MAX_VALUE_LENGTH_FOR_TOSTRING) {
+        return str;
+      }
+      return str.substring(0, TOSTRING_TRUNCATION_START_POS) + TOSTRING_TRUNCATION_MARKER
+          + str.substring(str.length() - TOSTRING_TRUNCATION_END_POS);
+    }
+
+    ColumnIndexBase(PrimitiveType type) {
+      comparator = type.comparator();
+      stringifier = type.stringifier();
+    }
+
+    @Override
+    public BoundaryOrder getBoundaryOrder() {
+      return boundaryOrder;
+    }
+
+    @Override
+    public List<Long> getNullCounts() {
+      if (nullCounts == null) {
+        return null;
+      }
+      return LongLists.unmodifiable(LongArrayList.wrap(nullCounts));
+    }
+
+    @Override
+    public List<Boolean> getNullPages() {
+      return BooleanLists.unmodifiable(BooleanArrayList.wrap(nullPages));
+    }
+
+    @Override
+    public List<ByteBuffer> getMinValues() {
+      List<ByteBuffer> list = new ArrayList<>(getPageCount());
+      // Min values are stored for the non-null pages only so a separate index is required for them
+      int arrayIndex = 0;
+      for (int i = 0, n = getPageCount(); i < n; ++i) {
+        if (isNullPage(i)) {
+          list.add(EMPTY_BYTE_BUFFER);
+        } else {
+          list.add(getMinValueAsBytes(arrayIndex++));
+        }
+      }
+      return list;
+    }
+
+    @Override
+    public List<ByteBuffer> getMaxValues() {
+      List<ByteBuffer> list = new ArrayList<>(getPageCount());
+      // Max values are stored for the non-null pages only so a separate index is required for them
+      int arrayIndex = 0;
+      for (int i = 0, n = getPageCount(); i < n; ++i) {
+        if (isNullPage(i)) {
+          list.add(EMPTY_BYTE_BUFFER);
+        } else {
+          list.add(getMaxValueAsBytes(arrayIndex++));
+        }
+      }
+      return list;
+    }
+
+    @Override
+    public String toString() {
+      try (Formatter formatter = new Formatter()) {
+        formatter.format("Boundary order: %s\n", boundaryOrder);
+        String minMaxPart = " %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s\n";
+        formatter.format("%-10s %20s" + minMaxPart, "", "null count", "min", "max");
+        String format = "page-%-5d %20s" + minMaxPart;
+        int arrayIndex = 0;
+        for (int i = 0, n = nullPages.length; i < n; ++i) {
+          String nullCount = nullCounts == null ? TOSTRING_MISSING_VALUE_MARKER : Long.toString(nullCounts[i]);
+          String min, max;
+          if (nullPages[i]) {
+            min = max = TOSTRING_MISSING_VALUE_MARKER;
+          } else {
+            min = truncate(getMinValueAsString(arrayIndex));
+            max = truncate(getMaxValueAsString(arrayIndex++));
+          }
+          formatter.format(format, i, nullCount, min, max);
+        }
+        return formatter.toString();
+      }
+    }
+
+    int getPageCount() {
+      return nullPages.length;
+    }
+
+    boolean isNullPage(int pageIndex) {
+      return nullPages[pageIndex];
+    }
+
+    /*
+     * Returns the min value for arrayIndex as a ByteBuffer. (Min values are not stored for null-pages so arrayIndex
+     * might not equal to pageIndex.)
+     */
+    abstract ByteBuffer getMinValueAsBytes(int arrayIndex);
+
+    /*
+     * Returns the max value for arrayIndex as a ByteBuffer. (Max values are not stored for null-pages so arrayIndex
+     * might not equal to pageIndex.)
+     */
+    abstract ByteBuffer getMaxValueAsBytes(int arrayIndex);
+
+    /*
+     * Returns the min value for arrayIndex as a String. (Min values are not stored for null-pages so arrayIndex might
+     * not equal to pageIndex.)
+     */
+    abstract String getMinValueAsString(int arrayIndex);
+
+    /*
+     * Returns the max value for arrayIndex as a String. (Max values are not stored for null-pages so arrayIndex might
+     * not equal to pageIndex.)
+     */
+    abstract String getMaxValueAsString(int arrayIndex);
+
+    /* Creates a Statistics object for filtering. Used for user defined predicates. */
+    abstract <T extends Comparable<T>> org.apache.parquet.filter2.predicate.Statistics<T> createStats(int arrayIndex);
+
+    /* Creates a ValueComparator object containing the specified value to be compared for min/max values */
+    abstract ValueComparator createValueComparator(Object value);
+
+    @Override
+    public PrimitiveIterator.OfInt visit(And and) {
+      throw new UnsupportedOperationException("AND shall not be used on column index directly");
+    }
+
+    @Override
+    public PrimitiveIterator.OfInt visit(Not not) {
+      throw new UnsupportedOperationException("NOT shall not be used on column index directly");
+    }
+
+    @Override
+    public PrimitiveIterator.OfInt visit(Or or) {
+      throw new UnsupportedOperationException("OR shall not be used on column index directly");
+    }
+
+    @Override
+    public <T extends Comparable<T>> PrimitiveIterator.OfInt visit(Eq<T> eq) {
+      T value = eq.getValue();
+      if (value == null) {
+        if (nullCounts == null) {
+          // Searching for nulls so if we don't have null related statistics we have to return all pages
+          return IndexIterator.all(getPageCount());
+        } else {
+          return IndexIterator.filter(getPageCount(), pageIndex -> nullCounts[pageIndex] > 0);
+        }
+      }
+      return getBoundaryOrder().eq(createValueComparator(value));
+    }
+
+    @Override
+    public <T extends Comparable<T>> PrimitiveIterator.OfInt visit(Gt<T> gt) {
+      return getBoundaryOrder().gt(createValueComparator(gt.getValue()));
+    }
+
+    @Override
+    public <T extends Comparable<T>> PrimitiveIterator.OfInt visit(GtEq<T> gtEq) {
+      return getBoundaryOrder().gtEq(createValueComparator(gtEq.getValue()));
+    }
+
+    @Override
+    public <T extends Comparable<T>> PrimitiveIterator.OfInt visit(Lt<T> lt) {
+      return getBoundaryOrder().lt(createValueComparator(lt.getValue()));
+    }
+
+    @Override
+    public <T extends Comparable<T>> PrimitiveIterator.OfInt visit(LtEq<T> ltEq) {
+      return getBoundaryOrder().ltEq(createValueComparator(ltEq.getValue()));
+    }
+
+    @Override
+    public <T extends Comparable<T>> PrimitiveIterator.OfInt visit(NotEq<T> notEq) {
+      T value = notEq.getValue();
+      if (value == null) {
+        // Searching for non-null values so every page matches that does not consist of nulls entirely
+        return IndexIterator.filter(getPageCount(), pageIndex -> !nullPages[pageIndex]);
+      }
+
+      if (nullCounts == null) {
+        // Nulls match so if we don't have null related statistics we have to return all pages
+        return IndexIterator.all(getPageCount());
+      }
+
+      // Merging value filtering with pages containing nulls
+      IntSet matchingIndexes = new IntOpenHashSet();
+      getBoundaryOrder().notEq(createValueComparator(value))
+          .forEachRemaining((int index) -> matchingIndexes.add(index));
+      return IndexIterator.filter(getPageCount(),
+          pageIndex -> nullCounts[pageIndex] > 0 || matchingIndexes.contains(pageIndex));
+    }
+
+    @Override
+    public <T extends Comparable<T>, U extends UserDefinedPredicate<T>> PrimitiveIterator.OfInt visit(
+        UserDefined<T, U> udp) {
+      final UserDefinedPredicate<T> predicate = udp.getUserDefinedPredicate();
+      final boolean acceptNulls = predicate.keep(null);
+
+      if (acceptNulls && nullCounts == null) {
+        // Nulls match so if we don't have null related statistics we have to return all pages
+        return IndexIterator.all(getPageCount());
+      }
+
+      return IndexIterator.filter(getPageCount(), new IntPredicate() {
+        // Stateful: counts the non-null pages seen so far. Works because IndexIterator tests every page index
+        // exactly once, in ascending order.
+        private int arrayIndex = -1;
+
+        @Override
+        public boolean test(int pageIndex) {
+          if (isNullPage(pageIndex)) {
+            return acceptNulls;
+          } else {
+            ++arrayIndex;
+            // nullCounts cannot be null here if acceptNulls is true (handled above)
+            if (acceptNulls && nullCounts[pageIndex] > 0) {
+              return true;
+            }
+            org.apache.parquet.filter2.predicate.Statistics<T> stats = createStats(arrayIndex);
+            return !predicate.canDrop(stats);
+          }
+        }
+      });
+    }
+
+    @Override
+    public <T extends Comparable<T>, U extends UserDefinedPredicate<T>> PrimitiveIterator.OfInt visit(
+        LogicalNotUserDefined<T, U> udp) {
+      final UserDefinedPredicate<T> inversePredicate = udp.getUserDefined().getUserDefinedPredicate();
+      final boolean acceptNulls = !inversePredicate.keep(null);
+
+      if (acceptNulls && nullCounts == null) {
+        // Nulls match so if we don't have null related statistics we have to return all pages
+        return IndexIterator.all(getPageCount());
+      }
+
+      return IndexIterator.filter(getPageCount(), new IntPredicate() {
+        // Stateful: counts the non-null pages seen so far. Works because IndexIterator tests every page index
+        // exactly once, in ascending order.
+        private int arrayIndex = -1;
+
+        @Override
+        public boolean test(int pageIndex) {
+          if (isNullPage(pageIndex)) {
+            return acceptNulls;
+          } else {
+            ++arrayIndex;
+            // nullCounts cannot be null here if acceptNulls is true (handled above)
+            if (acceptNulls && nullCounts[pageIndex] > 0) {
+              return true;
+            }
+            org.apache.parquet.filter2.predicate.Statistics<T> stats = createStats(arrayIndex);
+            return !inversePredicate.inverseCanDrop(stats);
+          }
+        }
+      });
+    }
+  }
+
+  private static final ColumnIndexBuilder NO_OP_BUILDER = new ColumnIndexBuilder() {
+    @Override
+    public ColumnIndex build() {
+      return null;
+    }
+
+    @Override
+    public void add(Statistics stats) {
+    }
+
+    @Override
+    void addMinMax(Object min, Object max) {
+    }
+
+    @Override
+    ColumnIndexBase createColumnIndex(PrimitiveType type) {
+      return null;
+    }
+
+    @Override
+    void clearMinMax() {
+    }
+
+    @Override
+    void addMinMaxFromBytes(ByteBuffer min, ByteBuffer max) {
+    }
+
+    @Override
+    int compareMinValues(PrimitiveComparator comparator, int index1, int index2) {
+      return 0;
+    }
+
+    @Override
+    int compareMaxValues(PrimitiveComparator comparator, int index1, int index2) {
+      return 0;
+    }
+
+    @Override
+    int sizeOf(Object value) {
+      return 0;
+    }
+  };
+
+  private PrimitiveType type;
+  private final BooleanList nullPages = new BooleanArrayList();
+  private final LongList nullCounts = new LongArrayList();
+  private long minMaxSize;
+  // The page index of every page that has min/max values (i.e. that is not a null-page)
+  private final IntList pageIndexes = new IntArrayList();
+  private int nextPageIndex;
+
+  /**
+   * @return a no-op builder that does not collect statistics objects and therefore returns {@code null} at
+   *         {@link #build()}.
+   */
+  public static ColumnIndexBuilder getNoOpBuilder() {
+    return NO_OP_BUILDER;
+  }
+
+  /**
+   * @param type
+   *          the type this builder is to be created for
+   * @param truncateLength
+   *          the length to be used for truncating binary values if possible
+   * @return a {@link ColumnIndexBuilder} instance to be used for creating {@link ColumnIndex} objects
+   */
+  public static ColumnIndexBuilder getBuilder(PrimitiveType type, int truncateLength) {
+    ColumnIndexBuilder builder = createNewBuilder(type, truncateLength);
+    builder.type = type;
+    return builder;
+  }
+
+  private static ColumnIndexBuilder createNewBuilder(PrimitiveType type, int truncateLength) {
+    switch (type.getPrimitiveTypeName()) {
+      case BINARY:
+      case FIXED_LEN_BYTE_ARRAY:
+      case INT96:
+        return new BinaryColumnIndexBuilder(type, truncateLength);
+      case BOOLEAN:
+        return new BooleanColumnIndexBuilder();
+      case DOUBLE:
+        return new DoubleColumnIndexBuilder();
+      case FLOAT:
+        return new FloatColumnIndexBuilder();
+      case INT32:
+        return new IntColumnIndexBuilder();
+      case INT64:
+        return new LongColumnIndexBuilder();
+      default:
+        throw new IllegalArgumentException("Unsupported type for column index: " + type);
+    }
+  }
+
+  /**
+   * Creates a {@link ColumnIndex} object directly from already existing values (e.g. the ones read from a file).
+   * <p>
+   * A new builder is used for every invocation so this method does not share any mutable state between its callers.
+   *
+   * @param type
+   *          the primitive type
+   * @param boundaryOrder
+   *          the boundary order of the min/max values
+   * @param nullPages
+   *          the null pages (one boolean value for each page that signifies whether the page consists of nulls
+   *          entirely)
+   * @param nullCounts
+   *          the number of null values for each page; might be {@code null}
+   * @param minValues
+   *          the min values for each page
+   * @param maxValues
+   *          the max values for each page
+   * @return the newly created {@link ColumnIndex} object based on the specified arguments or {@code null} if the
+   *         {@link ColumnIndex} would be empty or the type specific builder refuses to create it
+   * @throws NullPointerException
+   *           if {@code boundaryOrder} is {@code null}
+   * @throws IllegalArgumentException
+   *           if the sizes of the specified lists are not equal
+   */
+  public static ColumnIndex build(
+      PrimitiveType type,
+      BoundaryOrder boundaryOrder,
+      List<Boolean> nullPages,
+      List<Long> nullCounts,
+      List<ByteBuffer> minValues,
+      List<ByteBuffer> maxValues) {
+    requireNonNull(boundaryOrder);
+
+    // Builders are stateful so they must not be shared between (potentially concurrent) invocations
+    ColumnIndexBuilder builder = createNewBuilder(type, Integer.MAX_VALUE);
+
+    builder.fill(nullPages, nullCounts, minValues, maxValues);
+    ColumnIndexBase columnIndex = builder.build(type);
+    if (columnIndex == null) {
+      return null;
+    }
+    columnIndex.boundaryOrder = boundaryOrder;
+    return columnIndex;
+  }
+
+  ColumnIndexBuilder() {
+    // Shall be able to be created inside this package only
+  }
+
+  /**
+   * Adds the data from the specified statistics to this builder
+   *
+   * @param stats
+   *          the statistics to be added
+   */
+  public void add(Statistics stats) {
+    if (stats.hasNonNullValue()) {
+      nullPages.add(false);
+      Object min = stats.genericGetMin();
+      Object max = stats.genericGetMax();
+      addMinMax(min, max);
+      pageIndexes.add(nextPageIndex);
+      minMaxSize += sizeOf(min);
+      minMaxSize += sizeOf(max);
+    } else {
+      nullPages.add(true);
+    }
+    nullCounts.add(stats.getNumNulls());
+    ++nextPageIndex;
+  }
+
+  abstract void addMinMaxFromBytes(ByteBuffer min, ByteBuffer max);
+
+  abstract void addMinMax(Object min, Object max);
+
+  /*
+   * Replaces the content of this builder with the specified values. Min/max values are added for the pages only
+   * that are not null-pages.
+   */
+  private void fill(List<Boolean> nullPages, List<Long> nullCounts, List<ByteBuffer> minValues,
+      List<ByteBuffer> maxValues) {
+    clear();
+    int pageCount = nullPages.size();
+    if ((nullCounts != null && nullCounts.size() != pageCount) || minValues.size() != pageCount
+        || maxValues.size() != pageCount) {
+      throw new IllegalArgumentException(
+          String.format("Not all sizes are equal (nullPages:%d, nullCounts:%s, minValues:%d, maxValues:%d)",
+              nullPages.size(), nullCounts == null ? "null" : nullCounts.size(), minValues.size(), maxValues.size()));
+    }
+    this.nullPages.addAll(nullPages);
+    // Nullcounts is optional in the format
+    if (nullCounts != null) {
+      this.nullCounts.addAll(nullCounts);
+    }
+
+    for (int i = 0; i < pageCount; ++i) {
+      if (!nullPages.get(i)) {
+        ByteBuffer min = minValues.get(i);
+        ByteBuffer max = maxValues.get(i);
+        addMinMaxFromBytes(min, max);
+        pageIndexes.add(i);
+        minMaxSize += min.remaining();
+        minMaxSize += max.remaining();
+      }
+    }
+  }
+
+  /**
+   * @return the newly created column index or {@code null} if the {@link ColumnIndex} would be empty
+   */
+  public ColumnIndex build() {
+    ColumnIndexBase columnIndex = build(type);
+    if (columnIndex == null) {
+      return null;
+    }
+    columnIndex.boundaryOrder = calculateBoundaryOrder(type.comparator());
+    return columnIndex;
+  }
+
+  /*
+   * Creates the type specific column index and copies the common values into it. The boundary order is to be set by
+   * the caller. Returns null if no pages were added or the type specific builder does not create the column index.
+   */
+  private ColumnIndexBase build(PrimitiveType type) {
+    if (nullPages.isEmpty()) {
+      return null;
+    }
+    ColumnIndexBase columnIndex = createColumnIndex(type);
+    if (columnIndex == null) {
+      // Might happen if the specialized builder discovers invalid min/max values
+      return null;
+    }
+    columnIndex.nullPages = nullPages.toBooleanArray();
+    // Null counts is optional so keep it null if the builder has no values
+    if (!nullCounts.isEmpty()) {
+      columnIndex.nullCounts = nullCounts.toLongArray();
+    }
+    columnIndex.pageIndexes = pageIndexes.toIntArray();
+
+    return columnIndex;
+  }
+
+  private BoundaryOrder calculateBoundaryOrder(PrimitiveComparator comparator) {
+    if (isAscending(comparator)) {
+      return BoundaryOrder.ASCENDING;
+    } else if (isDescending(comparator)) {
+      return BoundaryOrder.DESCENDING;
+    } else {
+      return BoundaryOrder.UNORDERED;
+    }
+  }
+
+  // min[i] <= min[i+1] && max[i] <= max[i+1]
+  private boolean isAscending(PrimitiveComparator comparator) {
+    for (int i = 1, n = pageIndexes.size(); i < n; ++i) {
+      if (compareMinValues(comparator, i - 1, i) > 0 || compareMaxValues(comparator, i - 1, i) > 0) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // min[i] >= min[i+1] && max[i] >= max[i+1]
+  private boolean isDescending(PrimitiveComparator comparator) {
+    for (int i = 1, n = pageIndexes.size(); i < n; ++i) {
+      if (compareMinValues(comparator, i - 1, i) < 0 || compareMaxValues(comparator, i - 1, i) < 0) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  abstract int compareMinValues(PrimitiveComparator comparator, int index1, int index2);
+
+  abstract int compareMaxValues(PrimitiveComparator comparator, int index1, int index2);
+
+  private void clear() {
+    nullPages.clear();
+    nullCounts.clear();
+    clearMinMax();
+    minMaxSize = 0;
+    nextPageIndex = 0;
+    pageIndexes.clear();
+  }
+
+  abstract void clearMinMax();
+
+  abstract ColumnIndexBase createColumnIndex(PrimitiveType type);
+
+  abstract int sizeOf(Object value);
+
+  /**
+   * @return the number of pages added so far to this builder
+   */
+  public int getPageCount() {
+    return nullPages.size();
+  }
+
+  /**
+   * @return the sum of size in bytes of the min/max values added so far to this builder
+   */
+  public long getMinMaxSize() {
+    return minMaxSize;
+  }
+}
diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java
new file mode 100644
index 0000000000..074d02573f
--- /dev/null
+++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.internal.column.columnindex;
+
+import static java.nio.ByteOrder.LITTLE_ENDIAN;
+
+import java.nio.ByteBuffer;
+
+import org.apache.parquet.filter2.predicate.Statistics;
+import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.schema.PrimitiveComparator;
+import org.apache.parquet.schema.PrimitiveType;
+
+import it.unimi.dsi.fastutil.doubles.DoubleArrayList;
+import it.unimi.dsi.fastutil.doubles.DoubleList;
+
+/*
+ * Column index builder for double values. The min/max values are kept as primitive doubles and are
+ * (de)serialized as 8 bytes long little endian values.
+ */
+class DoubleColumnIndexBuilder extends ColumnIndexBuilder {
+  /*
+   * Column index implementation storing the min/max values in double arrays. The arrays are indexed by array index
+   * (one entry for each page that is not a null-page) and not by page index.
+   */
+  private static class DoubleColumnIndex extends ColumnIndexBase {
+    private double[] minValues;
+    private double[] maxValues;
+
+    private DoubleColumnIndex(PrimitiveType type) {
+      super(type);
+    }
+
+    @Override
+    ByteBuffer getMinValueAsBytes(int arrayIndex) {
+      return convert(minValues[arrayIndex]);
+    }
+
+    @Override
+    ByteBuffer getMaxValueAsBytes(int arrayIndex) {
+      return convert(maxValues[arrayIndex]);
+    }
+
+    @Override
+    String getMinValueAsString(int arrayIndex) {
+      return stringifier.stringify(minValues[arrayIndex]);
+    }
+
+    @Override
+    String getMaxValueAsString(int arrayIndex) {
+      return stringifier.stringify(maxValues[arrayIndex]);
+    }
+
+    @Override
+    @SuppressWarnings("unchecked")
+    <T extends Comparable<T>> Statistics<T> createStats(int arrayIndex) {
+      return (Statistics<T>) new Statistics<Double>(minValues[arrayIndex], maxValues[arrayIndex], comparator);
+    }
+
+    @Override
+    ValueComparator createValueComparator(Object value) {
+      // Unboxing only once; the comparisons below are done on primitive values
+      final double v = (double) value;
+      return new ValueComparator() {
+        @Override
+        int compareValueToMin(int arrayIndex) {
+          return comparator.compare(v, minValues[arrayIndex]);
+        }
+
+        @Override
+        int compareValueToMax(int arrayIndex) {
+          return comparator.compare(v, maxValues[arrayIndex]);
+        }
+      };
+    }
+  }
+
+  private final DoubleList minValues = new DoubleArrayList();
+  private final DoubleList maxValues = new DoubleArrayList();
+  // Set if a NaN min/max value was added; no column index is created in this case
+  private boolean invalid;
+
+  /*
+   * Reads a little endian double from the current position of the buffer. A duplicate is used so neither the
+   * position nor the byte order of the caller's buffer is modified.
+   */
+  private static double convert(ByteBuffer buffer) {
+    return buffer.duplicate().order(LITTLE_ENDIAN).getDouble(buffer.position());
+  }
+
+  /* Writes the value as a little endian double into a newly allocated buffer. */
+  private static ByteBuffer convert(double value) {
+    return ByteBuffer.allocate(Double.BYTES).order(LITTLE_ENDIAN).putDouble(0, value);
+  }
+
+  @Override
+  void addMinMaxFromBytes(ByteBuffer min, ByteBuffer max) {
+    minValues.add(convert(min));
+    maxValues.add(convert(max));
+  }
+
+  @Override
+  void addMinMax(Object min, Object max) {
+    double dMin = (double) min;
+    double dMax = (double) max;
+    if (Double.isNaN(dMin) || Double.isNaN(dMax)) {
+      // Invalidate this column index in case of NaN as the sorting order of values is undefined for this case
+      invalid = true;
+    }
+
+    // Sorting order is undefined for -0.0 so let min = -0.0 and max = +0.0 to ensure that no 0.0 values are skipped
+    if (Double.compare(dMin, +0.0) == 0) {
+      dMin = -0.0;
+    }
+    if (Double.compare(dMax, -0.0) == 0) {
+      dMax = +0.0;
+    }
+
+    minValues.add(dMin);
+    maxValues.add(dMax);
+  }
+
+  @Override
+  ColumnIndexBase createColumnIndex(PrimitiveType type) {
+    if (invalid) {
+      return null;
+    }
+    DoubleColumnIndex columnIndex = new DoubleColumnIndex(type);
+    columnIndex.minValues = minValues.toDoubleArray();
+    columnIndex.maxValues = maxValues.toDoubleArray();
+    return columnIndex;
+  }
+
+  @Override
+  void clearMinMax() {
+    minValues.clear();
+    maxValues.clear();
+    // The NaN values that invalidated the builder are dropped as well
+    invalid = false;
+  }
+
+  @Override
+  int compareMinValues(PrimitiveComparator comparator, int index1, int index2) {
+    return comparator.compare(minValues.getDouble(index1), minValues.getDouble(index2));
+  }
+
+  @Override
+  int compareMaxValues(PrimitiveComparator comparator, int index1, int index2) {
+    return comparator.compare(maxValues.getDouble(index1), maxValues.getDouble(index2));
+  }
+
+  @Override
+  int sizeOf(Object value) {
+    return Double.BYTES;
+  }
+}
diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java
new file mode 100644
index 0000000000..cbcdf949d8
--- /dev/null
+++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.internal.column.columnindex;
+
+import static java.nio.ByteOrder.LITTLE_ENDIAN;
+
+import java.nio.ByteBuffer;
+
+import org.apache.parquet.filter2.predicate.Statistics;
+import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.schema.PrimitiveComparator;
+import org.apache.parquet.schema.PrimitiveType;
+
+import it.unimi.dsi.fastutil.floats.FloatArrayList;
+import it.unimi.dsi.fastutil.floats.FloatList;
+
+/*
+ * Column index builder for float values. The min/max values are kept as primitive floats and are
+ * (de)serialized as 4 bytes long little endian values.
+ */
+class FloatColumnIndexBuilder extends ColumnIndexBuilder {
+  /*
+   * Column index implementation storing the min/max values in float arrays. The arrays are indexed by array index
+   * (one entry for each page that is not a null-page) and not by page index.
+   */
+  private static class FloatColumnIndex extends ColumnIndexBase {
+    private float[] minValues;
+    private float[] maxValues;
+
+    private FloatColumnIndex(PrimitiveType type) {
+      super(type);
+    }
+
+    @Override
+    ByteBuffer getMinValueAsBytes(int arrayIndex) {
+      return convert(minValues[arrayIndex]);
+    }
+
+    @Override
+    ByteBuffer getMaxValueAsBytes(int arrayIndex) {
+      return convert(maxValues[arrayIndex]);
+    }
+
+    @Override
+    String getMinValueAsString(int arrayIndex) {
+      return stringifier.stringify(minValues[arrayIndex]);
+    }
+
+    @Override
+    String getMaxValueAsString(int arrayIndex) {
+      return stringifier.stringify(maxValues[arrayIndex]);
+    }
+
+    @Override
+    @SuppressWarnings("unchecked")
+    <T extends Comparable<T>> Statistics<T> createStats(int arrayIndex) {
+      return (Statistics<T>) new Statistics<Float>(minValues[arrayIndex], maxValues[arrayIndex], comparator);
+    }
+
+    @Override
+    ValueComparator createValueComparator(Object value) {
+      // Unboxing only once; the comparisons below are done on primitive values
+      final float v = (float) value;
+      return new ValueComparator() {
+        @Override
+        int compareValueToMin(int arrayIndex) {
+          return comparator.compare(v, minValues[arrayIndex]);
+        }
+
+        @Override
+        int compareValueToMax(int arrayIndex) {
+          return comparator.compare(v, maxValues[arrayIndex]);
+        }
+      };
+    }
+  }
+
+  private final FloatList minValues = new FloatArrayList();
+  private final FloatList maxValues = new FloatArrayList();
+  // Set if a NaN min/max value was added; no column index is created in this case
+  private boolean invalid;
+
+  /*
+   * Reads a little endian float from the current position of the buffer. A duplicate is used so neither the
+   * position nor the byte order of the caller's buffer is modified.
+   */
+  private static float convert(ByteBuffer buffer) {
+    return buffer.duplicate().order(LITTLE_ENDIAN).getFloat(buffer.position());
+  }
+
+  /* Writes the value as a little endian float into a newly allocated buffer. */
+  private static ByteBuffer convert(float value) {
+    return ByteBuffer.allocate(Float.BYTES).order(LITTLE_ENDIAN).putFloat(0, value);
+  }
+
+  @Override
+  void addMinMaxFromBytes(ByteBuffer min, ByteBuffer max) {
+    minValues.add(convert(min));
+    maxValues.add(convert(max));
+  }
+
+  @Override
+  void addMinMax(Object min, Object max) {
+    float fMin = (float) min;
+    float fMax = (float) max;
+    if (Float.isNaN(fMin) || Float.isNaN(fMax)) {
+      // Invalidate this column index in case of NaN as the sorting order of values is undefined for this case
+      invalid = true;
+    }
+
+    // Sorting order is undefined for -0.0 so let min = -0.0 and max = +0.0 to ensure that no 0.0 values are skipped
+    if (Float.compare(fMin, +0.0f) == 0) {
+      fMin = -0.0f;
+    }
+    if (Float.compare(fMax, -0.0f) == 0) {
+      fMax = +0.0f;
+    }
+
+    minValues.add(fMin);
+    maxValues.add(fMax);
+  }
+
+  @Override
+  ColumnIndexBase createColumnIndex(PrimitiveType type) {
+    if (invalid) {
+      return null;
+    }
+    FloatColumnIndex columnIndex = new FloatColumnIndex(type);
+    columnIndex.minValues = minValues.toFloatArray();
+    columnIndex.maxValues = maxValues.toFloatArray();
+    return columnIndex;
+  }
+
+  @Override
+  void clearMinMax() {
+    minValues.clear();
+    maxValues.clear();
+    // The NaN values that invalidated the builder are dropped as well
+    invalid = false;
+  }
+
+  @Override
+  int compareMinValues(PrimitiveComparator comparator, int index1, int index2) {
+    return comparator.compare(minValues.getFloat(index1), minValues.getFloat(index2));
+  }
+
+  @Override
+  int compareMaxValues(PrimitiveComparator comparator, int index1, int index2) {
+    return comparator.compare(maxValues.getFloat(index1), maxValues.getFloat(index2));
+  }
+
+  @Override
+  int sizeOf(Object value) {
+    return Float.BYTES;
+  }
+}
diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/IndexIterator.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/IndexIterator.java
new file mode 100644
index 0000000000..9eab65e5bb
--- /dev/null
+++
b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/IndexIterator.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.internal.column.columnindex;
+
+import java.util.NoSuchElementException;
+import java.util.PrimitiveIterator;
+import java.util.function.IntPredicate;
+import java.util.function.IntUnaryOperator;
+
+import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder.ColumnIndexBase;
+
+/**
+ * Iterator over the indexes of a half-open range that are accepted by a filter. Every accepted index is passed
+ * through a translator before it is returned. The filter is invoked exactly once for each index, in ascending order.
+ */
+class IndexIterator implements PrimitiveIterator.OfInt {
+  public static final PrimitiveIterator.OfInt EMPTY = new PrimitiveIterator.OfInt() {
+    @Override
+    public boolean hasNext() {
+      return false;
+    }
+
+    @Override
+    public int nextInt() {
+      throw new NoSuchElementException();
+    }
+  };
+
+  // Marks the end of the iteration in the current field
+  private static final int NO_MORE_INDEX = -1;
+  private static final IntPredicate ACCEPT_ALL = i -> true;
+
+  private final int endIndex;
+  private final IntPredicate filter;
+  private final IntUnaryOperator translator;
+  // The next (not yet translated) index to be returned
+  private int current;
+
+  /* Iterates over all the page indexes in [0, pageCount). */
+  static PrimitiveIterator.OfInt all(int pageCount) {
+    return new IndexIterator(0, pageCount, ACCEPT_ALL, IntUnaryOperator.identity());
+  }
+
+  /* Iterates over all the array indexes of the comparator, translated to page indexes. */
+  static PrimitiveIterator.OfInt all(ColumnIndexBase.ValueComparator comparator) {
+    return new IndexIterator(0, comparator.arrayLength(), ACCEPT_ALL, comparator::translate);
+  }
+
+  /* Iterates over the page indexes in [0, pageCount) that are accepted by the filter. */
+  static PrimitiveIterator.OfInt filter(int pageCount, IntPredicate filter) {
+    return new IndexIterator(0, pageCount, filter, IntUnaryOperator.identity());
+  }
+
+  /* Iterates over the array indexes in [0, arrayLength) accepted by the filter; the translated values are returned. */
+  static PrimitiveIterator.OfInt filterTranslate(int arrayLength, IntPredicate filter, IntUnaryOperator translator) {
+    return new IndexIterator(0, arrayLength, filter, translator);
+  }
+
+  /* Iterates over the array indexes in [from, to] (both inclusive); the translated values are returned. */
+  static PrimitiveIterator.OfInt rangeTranslate(int from, int to, IntUnaryOperator translator) {
+    return new IndexIterator(from, to + 1, ACCEPT_ALL, translator);
+  }
+
+  private IndexIterator(int startIndex, int endIndex, IntPredicate filter, IntUnaryOperator translator) {
+    this.endIndex = endIndex;
+    this.filter = filter;
+    this.translator = translator;
+    this.current = seek(startIndex);
+  }
+
+  /* Returns the first index in [from, endIndex) accepted by the filter or NO_MORE_INDEX if there is none. */
+  private int seek(int from) {
+    int candidate = from;
+    while (candidate < endIndex) {
+      if (filter.test(candidate)) {
+        return candidate;
+      }
+      ++candidate;
+    }
+    return NO_MORE_INDEX;
+  }
+
+  @Override
+  public boolean hasNext() {
+    return current >= 0;
+  }
+
+  @Override
+  public int nextInt() {
+    if (!hasNext()) {
+      throw new NoSuchElementException();
+    }
+    int found = current;
+    // Looking up the following index before translating the current one
+    current = seek(found + 1);
+    return translator.applyAsInt(found);
+  }
+}
diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/IntColumnIndexBuilder.java
b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/IntColumnIndexBuilder.java
new file mode 100644
index 0000000000..2d19d270f6
--- /dev/null
+++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/IntColumnIndexBuilder.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.internal.column.columnindex;
+
+import static java.nio.ByteOrder.LITTLE_ENDIAN;
+
+import java.nio.ByteBuffer;
+
+import org.apache.parquet.filter2.predicate.Statistics;
+import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.schema.PrimitiveComparator;
+import org.apache.parquet.schema.PrimitiveType;
+
+import it.unimi.dsi.fastutil.ints.IntArrayList;
+import it.unimi.dsi.fastutil.ints.IntList;
+
+/*
+ * Column index builder for int values. The min/max values are kept as primitive ints and are
+ * (de)serialized as 4 bytes long little endian values.
+ */
+class IntColumnIndexBuilder extends ColumnIndexBuilder {
+  /*
+   * Column index implementation storing the min/max values in int arrays. The arrays are indexed by array index
+   * (one entry for each page that is not a null-page) and not by page index.
+   */
+  private static class IntColumnIndex extends ColumnIndexBase {
+    private int[] minValues;
+    private int[] maxValues;
+
+    private IntColumnIndex(PrimitiveType type) {
+      super(type);
+    }
+
+    @Override
+    ByteBuffer getMinValueAsBytes(int arrayIndex) {
+      return convert(minValues[arrayIndex]);
+    }
+
+    @Override
+    ByteBuffer getMaxValueAsBytes(int arrayIndex) {
+      return convert(maxValues[arrayIndex]);
+    }
+
+    @Override
+    String getMinValueAsString(int arrayIndex) {
+      return stringifier.stringify(minValues[arrayIndex]);
+    }
+
+    @Override
+    String getMaxValueAsString(int arrayIndex) {
+      return stringifier.stringify(maxValues[arrayIndex]);
+    }
+
+    @Override
+    @SuppressWarnings("unchecked")
+    <T extends Comparable<T>> Statistics<T> createStats(int arrayIndex) {
+      return (Statistics<T>) new Statistics<Integer>(minValues[arrayIndex], maxValues[arrayIndex], comparator);
+    }
+
+    @Override
+    ValueComparator createValueComparator(Object value) {
+      // Unboxing only once; the comparisons below are done on primitive values
+      final int v = (int) value;
+      return new ValueComparator() {
+        @Override
+        int compareValueToMin(int arrayIndex) {
+          return comparator.compare(v, minValues[arrayIndex]);
+        }
+
+        @Override
+        int compareValueToMax(int arrayIndex) {
+          return comparator.compare(v, maxValues[arrayIndex]);
+        }
+      };
+    }
+  }
+
+  private final IntList minValues = new IntArrayList();
+  private final IntList maxValues = new IntArrayList();
+
+  /*
+   * Reads a little endian int from the current position of the buffer. A duplicate is used so neither the
+   * position nor the byte order of the caller's buffer is modified.
+   */
+  private static int convert(ByteBuffer buffer) {
+    return buffer.duplicate().order(LITTLE_ENDIAN).getInt(buffer.position());
+  }
+
+  /* Writes the value as a little endian int into a newly allocated buffer. */
+  private static ByteBuffer convert(int value) {
+    return ByteBuffer.allocate(Integer.BYTES).order(LITTLE_ENDIAN).putInt(0, value);
+  }
+
+  @Override
+  void addMinMaxFromBytes(ByteBuffer min, ByteBuffer max) {
+    minValues.add(convert(min));
+    maxValues.add(convert(max));
+  }
+
+  @Override
+  void addMinMax(Object min, Object max) {
+    minValues.add((int) min);
+    maxValues.add((int) max);
+  }
+
+  @Override
+  ColumnIndexBase createColumnIndex(PrimitiveType type) {
+    IntColumnIndex columnIndex = new IntColumnIndex(type);
+    columnIndex.minValues = minValues.toIntArray();
+    columnIndex.maxValues = maxValues.toIntArray();
+    return columnIndex;
+  }
+
+  @Override
+  void clearMinMax() {
+    minValues.clear();
+    maxValues.clear();
+  }
+
+  @Override
+  int compareMinValues(PrimitiveComparator comparator, int index1, int index2) {
+    return comparator.compare(minValues.getInt(index1), minValues.getInt(index2));
+  }
+
+  @Override
+  int compareMaxValues(PrimitiveComparator comparator, int index1, int index2) {
+    return comparator.compare(maxValues.getInt(index1), maxValues.getInt(index2));
+  }
+
+  @Override
+  int sizeOf(Object value) {
+    return Integer.BYTES;
+  }
+}
diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/LongColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/LongColumnIndexBuilder.java
new file mode 100644
index 0000000000..b0189b7098
--- /dev/null
+++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/LongColumnIndexBuilder.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.internal.column.columnindex; + +import static java.nio.ByteOrder.LITTLE_ENDIAN; + +import java.nio.ByteBuffer; + +import org.apache.parquet.filter2.predicate.Statistics; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveComparator; +import org.apache.parquet.schema.PrimitiveType; + +import it.unimi.dsi.fastutil.longs.LongArrayList; +import it.unimi.dsi.fastutil.longs.LongList; + +class LongColumnIndexBuilder extends ColumnIndexBuilder { + private static class LongColumnIndex extends ColumnIndexBase { + private long[] minValues; + private long[] maxValues; + + private LongColumnIndex(PrimitiveType type) { + super(type); + } + + @Override + ByteBuffer getMinValueAsBytes(int pageIndex) { + return convert(minValues[pageIndex]); + } + + @Override + ByteBuffer getMaxValueAsBytes(int pageIndex) { + return convert(maxValues[pageIndex]); + } + + @Override + String getMinValueAsString(int pageIndex) { + return stringifier.stringify(minValues[pageIndex]); + } + + @Override + String getMaxValueAsString(int pageIndex) { + return stringifier.stringify(maxValues[pageIndex]); + } + + @Override + @SuppressWarnings("unchecked") + > Statistics createStats(int arrayIndex) { + return (Statistics) new Statistics(minValues[arrayIndex], maxValues[arrayIndex], comparator); + } + + @Override + ValueComparator createValueComparator(Object value) { + final long v = (long) value; + return new ValueComparator() { + @Override + int compareValueToMin(int arrayIndex) { + return comparator.compare(v, 
minValues[arrayIndex]); + } + + @Override + int compareValueToMax(int arrayIndex) { + return comparator.compare(v, maxValues[arrayIndex]); + } + }; + } + } + + private final LongList minValues = new LongArrayList(); + private final LongList maxValues = new LongArrayList(); + + private static long convert(ByteBuffer buffer) { + return buffer.order(LITTLE_ENDIAN).getLong(0); + } + + private static ByteBuffer convert(long value) { + return ByteBuffer.allocate(Long.BYTES).order(LITTLE_ENDIAN).putLong(0, value); + } + + @Override + void addMinMaxFromBytes(ByteBuffer min, ByteBuffer max) { + minValues.add(convert(min)); + maxValues.add(convert(max)); + } + + @Override + void addMinMax(Object min, Object max) { + minValues.add((long) min); + maxValues.add((long) max); + } + + @Override + ColumnIndexBase createColumnIndex(PrimitiveType type) { + LongColumnIndex columnIndex = new LongColumnIndex(type); + columnIndex.minValues = minValues.toLongArray(); + columnIndex.maxValues = maxValues.toLongArray(); + return columnIndex; + } + + @Override + void clearMinMax() { + minValues.clear(); + maxValues.clear(); + } + + @Override + int compareMinValues(PrimitiveComparator comparator, int index1, int index2) { + return comparator.compare(minValues.get(index1), minValues.get(index2)); + } + + @Override + int compareMaxValues(PrimitiveComparator comparator, int index1, int index2) { + return comparator.compare(maxValues.get(index1), maxValues.get(index2)); + } + + @Override + int sizeOf(Object value) { + return Long.BYTES; + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/OffsetIndex.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/OffsetIndex.java new file mode 100644 index 0000000000..ba984ebc70 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/OffsetIndex.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.internal.column.columnindex; + +/** + * Offset index containing the offset and size of the page and the index of the first row in the page. + * + * @see org.apache.parquet.format.OffsetIndex + */ +public interface OffsetIndex { + /** + * @return the number of pages + */ + public int getPageCount(); + + /** + * @param pageIndex + * the index of the page + * @return the offset of the page in the file + */ + public long getOffset(int pageIndex); + + /** + * @param pageIndex + * the index of the page + * @return the compressed size of the page (including page header) + */ + public int getCompressedPageSize(int pageIndex); + + /** + * @param pageIndex + * the index of the page + * @return the index of the first row in the page + */ + public long getFirstRowIndex(int pageIndex); + + /** + * @param pageIndex + * the index of the page + * @param rowGroupRowCount + * the total number of rows in the row-group + * @return the calculated index of the last row of the given page + */ + public default long getLastRowIndex(int pageIndex, long rowGroupRowCount) { + int nextPageIndex = pageIndex + 1; + return (nextPageIndex >= getPageCount() ? 
rowGroupRowCount : getFirstRowIndex(nextPageIndex)) - 1; + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/OffsetIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/OffsetIndexBuilder.java new file mode 100644 index 0000000000..e4907b5488 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/OffsetIndexBuilder.java @@ -0,0 +1,175 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.internal.column.columnindex; + +import java.util.Formatter; + +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.ints.IntList; +import it.unimi.dsi.fastutil.longs.LongArrayList; +import it.unimi.dsi.fastutil.longs.LongList; + +/** + * Builder implementation to create {@link OffsetIndex} objects during writing a parquet file. 
+ */ +public class OffsetIndexBuilder { + + private static class OffsetIndexImpl implements OffsetIndex { + private long[] offsets; + private int[] compressedPageSizes; + private long[] firstRowIndexes; + + @Override + public String toString() { + try (Formatter formatter = new Formatter()) { + formatter.format("%-10s %20s %16s %20s\n", "", "offset", "compressed size", "first row index"); + for (int i = 0, n = offsets.length; i < n; ++i) { + formatter.format("page-%-5d %20d %16d %20d\n", i, offsets[i], compressedPageSizes[i], firstRowIndexes[i]); + } + return formatter.toString(); + } + } + + @Override + public int getPageCount() { + return offsets.length; + } + + @Override + public long getOffset(int pageIndex) { + return offsets[pageIndex]; + } + + @Override + public int getCompressedPageSize(int pageIndex) { + return compressedPageSizes[pageIndex]; + } + + @Override + public long getFirstRowIndex(int pageIndex) { + return firstRowIndexes[pageIndex]; + } + } + + private static final OffsetIndexBuilder NO_OP_BUILDER = new OffsetIndexBuilder() { + @Override + public void add(int compressedPageSize, long rowCount) { + } + + @Override + public void add(long offset, int compressedPageSize, long rowCount) { + } + }; + + private final LongList offsets = new LongArrayList(); + private final IntList compressedPageSizes = new IntArrayList(); + private final LongList firstRowIndexes = new LongArrayList(); + private long previousOffset; + private int previousPageSize; + private long previousRowIndex; + private long previousRowCount; + + /** + * @return a no-op builder that does not collect values and therefore returns {@code null} at {@link #build(long)} + */ + public static OffsetIndexBuilder getNoOpBuilder() { + return NO_OP_BUILDER; + } + + /** + * @return an {@link OffsetIndexBuilder} instance to build an {@link OffsetIndex} object + */ + public static OffsetIndexBuilder getBuilder() { + return new OffsetIndexBuilder(); + } + + private OffsetIndexBuilder() { + } + + /** 
+ * Adds the specified parameters to this builder. Used by the writers to build up {@link OffsetIndex} objects to be + * written to the Parquet file. + * + * @param compressedPageSize + * the size of the page (including header) + * @param rowCount + * the number of rows in the page + */ + public void add(int compressedPageSize, long rowCount) { + add(previousOffset + previousPageSize, compressedPageSize, previousRowIndex + previousRowCount); + previousRowCount = rowCount; + } + + /** + * Adds the specified parameters to this builder. Used by the metadata converter to build up {@link OffsetIndex} + * objects read from the Parquet file. + * + * @param offset + * the offset of the page in the file + * @param compressedPageSize + * the size of the page (including header) + * @param firstRowIndex + * the index of the first row in the page (within the row group) + */ + public void add(long offset, int compressedPageSize, long firstRowIndex) { + previousOffset = offset; + offsets.add(offset); + previousPageSize = compressedPageSize; + compressedPageSizes.add(compressedPageSize); + previousRowIndex = firstRowIndex; + firstRowIndexes.add(firstRowIndex); + } + + /** + * Builds the offset index. Used by the metadata converter to build up {@link OffsetIndex} + * objects read from the Parquet file. + * + * @return the newly created offset index or {@code null} if the {@link OffsetIndex} object would be empty + */ + public OffsetIndex build() { + return build(0); + } + + /** + * Builds the offset index. Used by the writers to build up {@link OffsetIndex} objects to be + * written to the Parquet file.
+ * + * @param firstPageOffset + * the actual offset in the file to be used to translate all the collected offsets + * @return the newly created offset index or {@code null} if the {@link OffsetIndex} object would be empty + */ + public OffsetIndex build(long firstPageOffset) { + if (compressedPageSizes.isEmpty()) { + return null; + } + long[] offsets = this.offsets.toLongArray(); + if (firstPageOffset != 0) { + for (int i = 0, n = offsets.length; i < n; ++i) { + offsets[i] += firstPageOffset; + } + } + OffsetIndexImpl offsetIndex = new OffsetIndexImpl(); + offsetIndex.offsets = offsets; + offsetIndex.compressedPageSizes = compressedPageSizes.toIntArray(); + offsetIndex.firstRowIndexes = firstRowIndexes.toLongArray(); + + return offsetIndex; + } + +} diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexFilter.java b/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexFilter.java new file mode 100644 index 0000000000..fb3077e877 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexFilter.java @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.internal.filter2.columnindex; + +import java.util.PrimitiveIterator; +import java.util.Set; +import java.util.function.Function; + +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.FilterPredicateCompat; +import org.apache.parquet.filter2.compat.FilterCompat.NoOpFilter; +import org.apache.parquet.filter2.compat.FilterCompat.UnboundRecordFilterCompat; +import org.apache.parquet.filter2.predicate.FilterPredicate.Visitor; +import org.apache.parquet.filter2.predicate.Operators.And; +import org.apache.parquet.filter2.predicate.Operators.Column; +import org.apache.parquet.filter2.predicate.Operators.Eq; +import org.apache.parquet.filter2.predicate.Operators.Gt; +import org.apache.parquet.filter2.predicate.Operators.GtEq; +import org.apache.parquet.filter2.predicate.Operators.LogicalNotUserDefined; +import org.apache.parquet.filter2.predicate.Operators.Lt; +import org.apache.parquet.filter2.predicate.Operators.LtEq; +import org.apache.parquet.filter2.predicate.Operators.Not; +import org.apache.parquet.filter2.predicate.Operators.NotEq; +import org.apache.parquet.filter2.predicate.Operators.Or; +import org.apache.parquet.filter2.predicate.Operators.UserDefined; +import org.apache.parquet.filter2.predicate.UserDefinedPredicate; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore.MissingOffsetIndexException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Filter implementation based on column indexes. + * No filtering will be applied for columns where no column index is available. 
+ * Offset index is required for all the columns in the projection, therefore a {@link MissingOffsetIndexException} will + * be thrown from any {@code visit} methods if any of the required offset indexes is missing. + */ +public class ColumnIndexFilter implements Visitor { + + private static final Logger LOGGER = LoggerFactory.getLogger(ColumnIndexFilter.class); + private final ColumnIndexStore columnIndexStore; + private final Set columns; + private final long rowCount; + private RowRanges allRows; + + /** + * Calculates the row ranges containing the indexes of the rows that might match the specified filter. + * + * @param filter + * to be used for filtering the rows + * @param columnIndexStore + * the store for providing column/offset indexes + * @param paths + * the paths of the columns used in the actual projection; a column not being part of the projection will be + * handled as containing {@code null} values only even if the column has values written in the file + * @param rowCount + * the total number of rows in the row-group + * @return the ranges of the possible matching row indexes; the returned ranges will contain all the rows if any of + * the required offset indexes is missing + */ + public static RowRanges calculateRowRanges(FilterCompat.Filter filter, ColumnIndexStore columnIndexStore, + Set paths, long rowCount) { + return filter.accept(new FilterCompat.Visitor() { + @Override + public RowRanges visit(FilterPredicateCompat filterPredicateCompat) { + try { + return filterPredicateCompat.getFilterPredicate() + .accept(new ColumnIndexFilter(columnIndexStore, paths, rowCount)); + } catch (MissingOffsetIndexException e) { + LOGGER.warn("Unable to do filtering", e); + return RowRanges.createSingle(rowCount); + } + } + + @Override + public RowRanges visit(UnboundRecordFilterCompat unboundRecordFilterCompat) { + return RowRanges.createSingle(rowCount); + } + + @Override + public RowRanges visit(NoOpFilter noOpFilter) { + return RowRanges.createSingle(rowCount); +
} + }); + } + + private ColumnIndexFilter(ColumnIndexStore columnIndexStore, Set paths, long rowCount) { + this.columnIndexStore = columnIndexStore; + this.columns = paths; + this.rowCount = rowCount; + } + + private RowRanges allRows() { + if (allRows == null) { + allRows = RowRanges.createSingle(rowCount); + } + return allRows; + } + + @Override + public > RowRanges visit(Eq eq) { + return applyPredicate(eq.getColumn(), ci -> ci.visit(eq), eq.getValue() == null ? allRows() : RowRanges.EMPTY); + } + + @Override + public > RowRanges visit(NotEq notEq) { + return applyPredicate(notEq.getColumn(), ci -> ci.visit(notEq), + notEq.getValue() == null ? RowRanges.EMPTY : allRows()); + } + + @Override + public > RowRanges visit(Lt lt) { + return applyPredicate(lt.getColumn(), ci -> ci.visit(lt), RowRanges.EMPTY); + } + + @Override + public > RowRanges visit(LtEq ltEq) { + return applyPredicate(ltEq.getColumn(), ci -> ci.visit(ltEq), RowRanges.EMPTY); + } + + @Override + public > RowRanges visit(Gt gt) { + return applyPredicate(gt.getColumn(), ci -> ci.visit(gt), RowRanges.EMPTY); + } + + @Override + public > RowRanges visit(GtEq gtEq) { + return applyPredicate(gtEq.getColumn(), ci -> ci.visit(gtEq), RowRanges.EMPTY); + } + + @Override + public , U extends UserDefinedPredicate> RowRanges visit(UserDefined udp) { + return applyPredicate(udp.getColumn(), ci -> ci.visit(udp), + udp.getUserDefinedPredicate().keep(null) ? allRows() : RowRanges.EMPTY); + } + + @Override + public , U extends UserDefinedPredicate> RowRanges visit( + LogicalNotUserDefined udp) { + return applyPredicate(udp.getUserDefined().getColumn(), ci -> ci.visit(udp), + udp.getUserDefined().getUserDefinedPredicate().keep(null) ? 
RowRanges.EMPTY : allRows()); + } + + private RowRanges applyPredicate(Column column, Function func, + RowRanges rangesForMissingColumns) { + ColumnPath columnPath = column.getColumnPath(); + if (!columns.contains(columnPath)) { + return rangesForMissingColumns; + } + + OffsetIndex oi = columnIndexStore.getOffsetIndex(columnPath); + ColumnIndex ci = columnIndexStore.getColumnIndex(columnPath); + if (ci == null) { + LOGGER.warn("No column index for column {} is available; Unable to filter on this column", columnPath); + return allRows(); + } + + return RowRanges.create(rowCount, func.apply(ci), oi); + } + + @Override + public RowRanges visit(And and) { + return RowRanges.intersection(and.getLeft().accept(this), and.getRight().accept(this)); + } + + @Override + public RowRanges visit(Or or) { + return RowRanges.union(or.getLeft().accept(this), or.getRight().accept(this)); + } + + @Override + public RowRanges visit(Not not) { + throw new IllegalArgumentException( + "Predicates containing a NOT must be run through LogicalInverseRewriter. " + not); + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexStore.java b/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexStore.java new file mode 100644 index 0000000000..c82861ac25 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexStore.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.internal.filter2.columnindex; + +import org.apache.parquet.ParquetRuntimeException; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; + +/** + * Provides the {@link ColumnIndex} and {@link OffsetIndex} objects for a row-group. + */ +public interface ColumnIndexStore { + + /** + * Exception thrown in case of an offset index is missing for any of the columns. + */ + public static class MissingOffsetIndexException extends ParquetRuntimeException { + public MissingOffsetIndexException(ColumnPath path) { + super("No offset index for column " + path.toDotString() + " is available; Unable to do filtering"); + } + } + + /** + * @param column + * the path of the column + * @return the column index for the column-chunk in the row-group or {@code null} if no column index is available + */ + ColumnIndex getColumnIndex(ColumnPath column); + + /** + * @param column + * the path of the column + * @return the offset index for the column-chunk in the row-group + * @throws MissingOffsetIndexException + * if the related offset index is missing + */ + OffsetIndex getOffsetIndex(ColumnPath column) throws MissingOffsetIndexException; +} diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/RowRanges.java b/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/RowRanges.java new file mode 100644 index 0000000000..7753507900 --- /dev/null +++ 
b/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/RowRanges.java @@ -0,0 +1,288 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.internal.filter2.columnindex; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.NoSuchElementException; +import java.util.PrimitiveIterator; +import java.util.Set; + +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; + +/** + * Class representing row ranges in a row-group. These row ranges are calculated as a result of the column index based + * filtering. To be used to iterate over the matching row indexes to be read from a row-group, retrieve the count of the + * matching rows or check overlapping of a row index range. + * + * @see ColumnIndexFilter#calculateRowRanges(Filter, ColumnIndexStore, Set, long) + */ +public class RowRanges { + private static class Range { + + // Returns the union of the two ranges or null if there are elements between them.
+ private static Range union(Range left, Range right) { + if (left.from <= right.from) { + if (left.to + 1 >= right.from) { + return new Range(left.from, Math.max(left.to, right.to)); + } + } else if (right.to + 1 >= left.from) { + return new Range(right.from, Math.max(left.to, right.to)); + } + return null; + } + + // Returns the intersection of the two ranges or null if they do not overlap. + private static Range intersection(Range left, Range right) { + if (left.from <= right.from) { + if (left.to >= right.from) { + return new Range(right.from, Math.min(left.to, right.to)); + } + } else if (right.to >= left.from) { + return new Range(left.from, Math.min(left.to, right.to)); + } + return null; + } + + final long from; + final long to; + + // Creates a range of [from, to] (from and to are inclusive; empty ranges are not valid) + Range(long from, long to) { + assert from <= to; + this.from = from; + this.to = to; + } + + long count() { + return to - from + 1; + } + + boolean isBefore(Range other) { + return to < other.from; + } + + boolean isAfter(Range other) { + return from > other.to; + } + + @Override + public String toString() { + return "[" + from + ", " + to + ']'; + } + } + + static final RowRanges EMPTY = new RowRanges(); + + /* + * Creates a new RowRanges object with the single range [0, rowCount - 1]. + */ + static RowRanges createSingle(long rowCount) { + RowRanges ranges = new RowRanges(); + ranges.add(new Range(0, rowCount - 1)); + return ranges; + } + + /* + * Creates a new RowRanges object with the following ranges. + * [firstRowIndex[0], lastRowIndex[0]], + * [firstRowIndex[1], lastRowIndex[1]], + * ..., + * [firstRowIndex[n], lastRowIndex[n]] + * (See OffsetIndex.getFirstRowIndex and OffsetIndex.getLastRowIndex for details.) + * + * The union of the ranges is calculated so the result ranges always contain the disjunct ranges. See union for + * details.
+ */ + static RowRanges create(long rowCount, PrimitiveIterator.OfInt pageIndexes, OffsetIndex offsetIndex) { + RowRanges ranges = new RowRanges(); + while (pageIndexes.hasNext()) { + int pageIndex = pageIndexes.nextInt(); + ranges.add(new Range(offsetIndex.getFirstRowIndex(pageIndex), offsetIndex.getLastRowIndex(pageIndex, rowCount))); + } + return ranges; + } + + /* + * Calculates the union of the two specified RowRanges objects. The union of two ranges is calculated if there are no + * elements between them. Otherwise, the two disjunct ranges are stored separately. + * For example: + * [113, 241] ∪ [221, 340] = [113, 340] + * [113, 230] ∪ [231, 340] = [113, 340] + * while + * [113, 230] ∪ [232, 340] = [113, 230], [232, 340] + * + * The result RowRanges object will contain all the row indexes that were contained in one of the specified objects. + */ + static RowRanges union(RowRanges left, RowRanges right) { + RowRanges result = new RowRanges(); + Iterator it1 = left.ranges.iterator(); + Iterator it2 = right.ranges.iterator(); + if (it2.hasNext()) { + Range range2 = it2.next(); + while (it1.hasNext()) { + Range range1 = it1.next(); + if (range1.isAfter(range2)) { + result.add(range2); + range2 = range1; + Iterator tmp = it1; + it1 = it2; + it2 = tmp; + } else { + result.add(range1); + } + } + result.add(range2); + } else { + it2 = it1; + } + while (it2.hasNext()) { + result.add(it2.next()); + } + + return result; + } + + /* + * Calculates the intersection of the two specified RowRanges objects. Two ranges intersect if they have common + * elements, otherwise the result is empty.
+ * For example: + * [113, 241] ∩ [221, 340] = [221, 241] + * while + * [113, 230] ∩ [231, 340] = (empty) + * + * The result RowRanges object will contain all the row indexes that were contained in both of the specified objects + */ + static RowRanges intersection(RowRanges left, RowRanges right) { + RowRanges result = new RowRanges(); + + int rightIndex = 0; + for (Range l : left.ranges) { + for (int i = rightIndex, n = right.ranges.size(); i < n; ++i) { + Range r = right.ranges.get(i); + if (l.isBefore(r)) { + break; + } else if (l.isAfter(r)) { + rightIndex = i + 1; + continue; + } + result.add(Range.intersection(l, r)); + } + } + + return result; + } + + private final List ranges = new ArrayList<>(); + + private RowRanges() { + } + + /* + * Adds a range to the end of the list of ranges. It maintains the disjunct ascending order(*) of the ranges by + * trying to union the specified range to the last ranges in the list. The specified range shall be larger(*) than + * the last one or might be overlapped with some of the last ones.
+ * (*) [a, b] < [c, d] if b < c + */ + private void add(Range range) { + Range rangeToAdd = range; + for (int i = ranges.size() - 1; i >= 0; --i) { + Range last = ranges.get(i); + assert !last.isAfter(range); + Range u = Range.union(last, rangeToAdd); + if (u == null) { + break; + } + rangeToAdd = u; + ranges.remove(i); + } + ranges.add(rangeToAdd); + } + + /** + * @return the number of rows in the ranges + */ + public long rowCount() { + long cnt = 0; + for (Range range : ranges) { + cnt += range.count(); + } + return cnt; + } + + /** + * @return the ascending iterator of the row indexes contained in the ranges + */ + public PrimitiveIterator.OfLong iterator() { + return new PrimitiveIterator.OfLong() { + private int currentRangeIndex = -1; + private Range currentRange; + private long next = findNext(); + + private long findNext() { + if (currentRange == null || next + 1 > currentRange.to) { + if (currentRangeIndex + 1 < ranges.size()) { + currentRange = ranges.get(++currentRangeIndex); + next = currentRange.from; + } else { + return -1; + } + } else { + ++next; + } + return next; + } + + @Override + public boolean hasNext() { + return next >= 0; + } + + @Override + public long nextLong() { + long ret = next; + if (ret < 0) { + throw new NoSuchElementException(); + } + next = findNext(); + return ret; + } + }; + } + + /** + * @param from + * the first row of the range to be checked for connection + * @param to + * the last row of the range to be checked for connection + * @return {@code true} if the specified range is overlapping (have common elements) with one of the ranges + */ + public boolean isOverlapping(long from, long to) { + return Collections.binarySearch(ranges, new Range(from, to), + (r1, r2) -> r1.isBefore(r2) ? -1 : r1.isAfter(r2) ? 
1 : 0) >= 0; + } + + @Override + public String toString() { + return ranges.toString(); + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/ConversionPatterns.java b/parquet-column/src/main/java/org/apache/parquet/schema/ConversionPatterns.java index 6db1e587c9..a530db13c8 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/ConversionPatterns.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/ConversionPatterns.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -22,7 +22,7 @@ import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type.Repetition; -import static org.apache.parquet.schema.OriginalType.*; +import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; /** * Utility functions to convert from Java-like map and list types @@ -37,15 +37,15 @@ public abstract class ConversionPatterns { * * @param repetition repetition for the list or map * @param alias name of the field - * @param originalType original type for the list or map + * @param logicalTypeAnnotation logical type for the list or map * @param nested the nested repeated field * @return a group type */ - private static GroupType listWrapper(Repetition repetition, String alias, OriginalType originalType, Type nested) { + private static GroupType listWrapper(Repetition repetition, String alias, LogicalTypeAnnotation 
logicalTypeAnnotation, Type nested) { if (!nested.isRepetition(Repetition.REPEATED)) { throw new IllegalArgumentException("Nested type should be repeated: " + nested); } - return new GroupType(repetition, alias, originalType, nested); + return new GroupType(repetition, alias, logicalTypeAnnotation, nested); } public static GroupType mapType(Repetition repetition, String alias, Type keyType, Type valueType) { @@ -53,7 +53,7 @@ public static GroupType mapType(Repetition repetition, String alias, Type keyTyp } public static GroupType stringKeyMapType(Repetition repetition, String alias, String mapAlias, Type valueType) { - return mapType(repetition, alias, mapAlias, new PrimitiveType(Repetition.REQUIRED, PrimitiveTypeName.BINARY, "key", OriginalType.UTF8), valueType); + return mapType(repetition, alias, mapAlias, new PrimitiveType(Repetition.REQUIRED, PrimitiveTypeName.BINARY, "key", stringType()), valueType); } public static GroupType stringKeyMapType(Repetition repetition, String alias, Type valueType) { @@ -66,11 +66,11 @@ public static GroupType mapType(Repetition repetition, String alias, String mapA return listWrapper( repetition, alias, - MAP, + LogicalTypeAnnotation.mapType(), new GroupType( Repetition.REPEATED, mapAlias, - MAP_KEY_VALUE, + LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance(), keyType) ); } else { @@ -80,11 +80,11 @@ public static GroupType mapType(Repetition repetition, String alias, String mapA return listWrapper( repetition, alias, - MAP, + LogicalTypeAnnotation.mapType(), new GroupType( Repetition.REPEATED, mapAlias, - MAP_KEY_VALUE, + LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance(), keyType, valueType) ); @@ -103,7 +103,7 @@ public static GroupType listType(Repetition repetition, String alias, Type neste return listWrapper( repetition, alias, - LIST, + LogicalTypeAnnotation.listType(), nestedType ); } @@ -125,7 +125,7 @@ public static GroupType listOfElements(Repetition listRepetition, String name, T return 
listWrapper( listRepetition, name, - LIST, + LogicalTypeAnnotation.listType(), new GroupType(Repetition.REPEATED, "list", elementType) ); } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/GroupType.java b/parquet-column/src/main/java/org/apache/parquet/schema/GroupType.java index 5cb40e5e39..64e7062959 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/GroupType.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/GroupType.java @@ -67,6 +67,16 @@ public GroupType(Repetition repetition, String name, OriginalType originalType, this(repetition, name, originalType, Arrays.asList(fields)); } + /** + * @param repetition OPTIONAL, REPEATED, REQUIRED + * @param name the name of the field + * @param logicalTypeAnnotation (optional) the logical type to help with cross schema conversion (LIST, MAP, ...) + * @param fields the contained fields + */ + GroupType(Repetition repetition, String name, LogicalTypeAnnotation logicalTypeAnnotation, Type... fields) { + this(repetition, name, logicalTypeAnnotation, Arrays.asList(fields)); + } + /** * @param repetition OPTIONAL, REPEATED, REQUIRED * @param name the name of the field @@ -78,6 +88,16 @@ public GroupType(Repetition repetition, String name, OriginalType originalType, this(repetition, name, originalType, fields, null); } + /** + * @param repetition OPTIONAL, REPEATED, REQUIRED + * @param name the name of the field + * @param logicalTypeAnnotation (optional) the logical type to help with cross schema conversion (LIST, MAP, ...) 
+ * @param fields the contained fields + */ + GroupType(Repetition repetition, String name, LogicalTypeAnnotation logicalTypeAnnotation, List fields) { + this(repetition, name, logicalTypeAnnotation, fields, null); + } + /** * @param repetition OPTIONAL, REPEATED, REQUIRED * @param name the name of the field @@ -109,7 +129,7 @@ public GroupType(Repetition repetition, String name, OriginalType originalType, */ @Override public GroupType withId(int id) { - return new GroupType(getRepetition(), getName(), getOriginalType(), fields, new ID(id)); + return new GroupType(getRepetition(), getName(), getLogicalTypeAnnotation(), fields, new ID(id)); } /** @@ -117,7 +137,7 @@ public GroupType withId(int id) { * @return a group with the same attributes and new fields. */ public GroupType withNewFields(List newFields) { - return new GroupType(getRepetition(), getName(), getOriginalType(), newFields, getId()); + return new GroupType(getRepetition(), getName(), getLogicalTypeAnnotation(), newFields, getId()); } /** @@ -219,7 +239,7 @@ public void writeToStringBuilder(StringBuilder sb, String indent) { .append(getRepetition().name().toLowerCase(Locale.ENGLISH)) .append(" group ") .append(getName()) - .append(getOriginalType() == null ? "" : " (" + getOriginalType() +")") + .append(getLogicalTypeAnnotation() == null ? "" : " (" + getLogicalTypeAnnotation().toString() +")") .append(getId() == null ? 
"" : " = " + getId()) .append(" {\n"); membersDisplayString(sb, indent + " "); @@ -250,7 +270,7 @@ protected boolean typeEquals(Type other) { */ @Override public int hashCode() { - return Objects.hash(getOriginalType(), getFields()); + return Objects.hash(getLogicalTypeAnnotation(), getFields()); } /** @@ -261,7 +281,7 @@ protected boolean equals(Type otherType) { return !otherType.isPrimitive() && super.equals(otherType) - && getOriginalType() == otherType.getOriginalType() + && Objects.equals(getLogicalTypeAnnotation(),otherType.getLogicalTypeAnnotation()) && getFields().equals(otherType.asGroupType().getFields()); } @@ -355,7 +375,7 @@ protected Type union(Type toMerge, boolean strict) { if (toMerge.isPrimitive()) { throw new IncompatibleSchemaModificationException("can not merge primitive type " + toMerge + " into group type " + this); } - return new GroupType(toMerge.getRepetition(), getName(), toMerge.getOriginalType(), mergeFields(toMerge.asGroupType()), getId()); + return new GroupType(toMerge.getRepetition(), getName(), toMerge.getLogicalTypeAnnotation(), mergeFields(toMerge.asGroupType()), getId()); } /** @@ -383,8 +403,8 @@ List mergeFields(GroupType toMerge, boolean strict) { if (fieldToMerge.getRepetition().isMoreRestrictiveThan(type.getRepetition())) { throw new IncompatibleSchemaModificationException("repetition constraint is more restrictive: can not merge type " + fieldToMerge + " into " + type); } - if (type.getOriginalType() != null && fieldToMerge.getOriginalType() != type.getOriginalType()) { - throw new IncompatibleSchemaModificationException("cannot merge original type " + fieldToMerge.getOriginalType() + " into " + type.getOriginalType()); + if (type.getLogicalTypeAnnotation() != null && !type.getLogicalTypeAnnotation().equals(fieldToMerge.getLogicalTypeAnnotation())) { + throw new IncompatibleSchemaModificationException("cannot merge logical type " + fieldToMerge.getLogicalTypeAnnotation() + " into " + type.getLogicalTypeAnnotation()); } 
merged = type.union(fieldToMerge, strict); } else { diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java b/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java index 340a24af16..c1b7d99fd8 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java @@ -20,12 +20,28 @@ import org.apache.parquet.Preconditions; +import java.util.Collections; +import java.util.HashSet; import java.util.List; import java.util.Objects; import java.util.Optional; +import java.util.Set; import java.util.function.Supplier; +import static java.util.Arrays.asList; import static java.util.Optional.empty; +import static org.apache.parquet.schema.ColumnOrder.ColumnOrderName.TYPE_DEFINED_ORDER; +import static org.apache.parquet.schema.ColumnOrder.ColumnOrderName.UNDEFINED; +import static org.apache.parquet.schema.PrimitiveStringifier.TIMESTAMP_MICROS_STRINGIFIER; +import static org.apache.parquet.schema.PrimitiveStringifier.TIMESTAMP_MICROS_UTC_STRINGIFIER; +import static org.apache.parquet.schema.PrimitiveStringifier.TIMESTAMP_MILLIS_STRINGIFIER; +import static org.apache.parquet.schema.PrimitiveStringifier.TIMESTAMP_MILLIS_UTC_STRINGIFIER; +import static org.apache.parquet.schema.PrimitiveStringifier.TIMESTAMP_NANOS_STRINGIFIER; +import static org.apache.parquet.schema.PrimitiveStringifier.TIMESTAMP_NANOS_UTC_STRINGIFIER; +import static org.apache.parquet.schema.PrimitiveStringifier.TIME_NANOS_STRINGIFIER; +import static org.apache.parquet.schema.PrimitiveStringifier.TIME_NANOS_UTC_STRINGIFIER; +import static org.apache.parquet.schema.PrimitiveStringifier.TIME_STRINGIFIER; +import static org.apache.parquet.schema.PrimitiveStringifier.TIME_UTC_STRINGIFIER; public abstract class LogicalTypeAnnotation { enum LogicalTypeToken { @@ -144,6 +160,10 @@ String typeParametersAsString() { return ""; } + boolean 
isValidColumnOrder(ColumnOrder columnOrder) { + return columnOrder.getColumnOrderName() == UNDEFINED || columnOrder.getColumnOrderName() == TYPE_DEFINED_ORDER; + } + @Override public String toString() { StringBuilder sb = new StringBuilder(); @@ -152,6 +172,10 @@ public String toString() { return sb.toString(); } + PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) { + throw new UnsupportedOperationException("Stringifier is not supported for the logical type: " + this); + } + /** * Helper method to convert the old representation of logical types (OriginalType) to new logical type. */ @@ -290,6 +314,11 @@ public int hashCode() { // This type doesn't have any parameters, thus using class hashcode return getClass().hashCode(); } + + @Override + PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) { + return PrimitiveStringifier.UTF8_STRINGIFIER; + } } public static class MapLogicalTypeAnnotation extends LogicalTypeAnnotation { @@ -389,15 +418,22 @@ public int hashCode() { // This type doesn't have any parameters, thus using class hashcode return getClass().hashCode(); } + + @Override + PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) { + return PrimitiveStringifier.UTF8_STRINGIFIER; + } } public static class DecimalLogicalTypeAnnotation extends LogicalTypeAnnotation { + private final PrimitiveStringifier stringifier; private final int scale; private final int precision; private DecimalLogicalTypeAnnotation(int scale, int precision) { this.scale = scale; this.precision = precision; + stringifier = PrimitiveStringifier.createDecimalStringifier(scale); } public int getPrecision() { @@ -447,6 +483,11 @@ public boolean equals(Object obj) { public int hashCode() { return Objects.hash(scale, precision); } + + @Override + PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) { + return stringifier; + } } public static class DateLogicalTypeAnnotation extends LogicalTypeAnnotation { @@ -480,11 +521,17 @@ public int 
hashCode() { // This type doesn't have any parameters, thus using class hashcode return getClass().hashCode(); } + + @Override + PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) { + return PrimitiveStringifier.DATE_STRINGIFIER; + } } public enum TimeUnit { MILLIS, - MICROS + MICROS, + NANOS } public static class TimeLogicalTypeAnnotation extends LogicalTypeAnnotation { @@ -504,7 +551,7 @@ public OriginalType toOriginalType() { case MICROS: return OriginalType.TIME_MICROS; default: - throw new RuntimeException("Unknown original type for " + unit); + return null; } } @@ -550,6 +597,19 @@ public boolean equals(Object obj) { public int hashCode() { return Objects.hash(isAdjustedToUTC, unit); } + + @Override + PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) { + switch (unit) { + case MICROS: + case MILLIS: + return isAdjustedToUTC ? TIME_UTC_STRINGIFIER : TIME_STRINGIFIER; + case NANOS: + return isAdjustedToUTC ? TIME_NANOS_UTC_STRINGIFIER : TIME_NANOS_STRINGIFIER; + default: + return super.valueStringifier(primitiveType); + } + } } public static class TimestampLogicalTypeAnnotation extends LogicalTypeAnnotation { @@ -569,7 +629,7 @@ public OriginalType toOriginalType() { case MICROS: return OriginalType.TIMESTAMP_MICROS; default: - throw new RuntimeException("Unknown original type for " + unit); + return null; } } @@ -615,14 +675,33 @@ public boolean equals(Object obj) { public int hashCode() { return Objects.hash(isAdjustedToUTC, unit); } + + @Override + PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) { + switch (unit) { + case MICROS: + return isAdjustedToUTC ? TIMESTAMP_MICROS_UTC_STRINGIFIER : TIMESTAMP_MICROS_STRINGIFIER; + case MILLIS: + return isAdjustedToUTC ? TIMESTAMP_MILLIS_UTC_STRINGIFIER : TIMESTAMP_MILLIS_STRINGIFIER; + case NANOS: + return isAdjustedToUTC ? 
TIMESTAMP_NANOS_UTC_STRINGIFIER : TIMESTAMP_NANOS_STRINGIFIER; + default: + return super.valueStringifier(primitiveType); + } + } } public static class IntLogicalTypeAnnotation extends LogicalTypeAnnotation { + private static final Set VALID_BIT_WIDTH = Collections.unmodifiableSet( + new HashSet<>(asList(8, 16, 32, 64))); + private final int bitWidth; private final boolean isSigned; - private IntLogicalTypeAnnotation(int bitWidth, boolean isSigned) { + if (!VALID_BIT_WIDTH.contains(bitWidth)) { + throw new IllegalArgumentException("Invalid integer bit width: " + bitWidth); + } this.bitWidth = bitWidth; this.isSigned = isSigned; } @@ -639,7 +718,7 @@ public OriginalType toOriginalType() { case 64: return isSigned ? OriginalType.INT_64 : OriginalType.UINT_64; default: - throw new RuntimeException("Unknown original type " + toOriginalType()); + return null; } } @@ -685,6 +764,11 @@ public boolean equals(Object obj) { public int hashCode() { return Objects.hash(bitWidth, isSigned); } + + @Override + PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) { + return isSigned ? 
PrimitiveStringifier.DEFAULT_STRINGIFIER : PrimitiveStringifier.UNSIGNED_STRINGIFIER; + } } public static class JsonLogicalTypeAnnotation extends LogicalTypeAnnotation { @@ -718,6 +802,11 @@ public int hashCode() { // This type doesn't have any parameters, thus using class hashcode return getClass().hashCode(); } + + @Override + PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) { + return PrimitiveStringifier.UTF8_STRINGIFIER; + } } public static class BsonLogicalTypeAnnotation extends LogicalTypeAnnotation { @@ -751,6 +840,11 @@ public int hashCode() { // This type doesn't have any parameters, thus using class hashcode return getClass().hashCode(); } + + @Override + PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) { + return PrimitiveStringifier.DEFAULT_STRINGIFIER; + } } // This logical type annotation is implemented to support backward compatibility with ConvertedType. @@ -791,6 +885,16 @@ public int hashCode() { // This type doesn't have any parameters, thus using class hashcode return getClass().hashCode(); } + + @Override + PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) { + return PrimitiveStringifier.INTERVAL_STRINGIFIER; + } + + @Override + boolean isValidColumnOrder(ColumnOrder columnOrder) { + return columnOrder.getColumnOrderName() == UNDEFINED; + } } // This logical type annotation is implemented to support backward compatibility with ConvertedType. @@ -845,55 +949,55 @@ public int hashCode() { * or {@link Optional#orElseThrow(Supplier)} to throw exception if omitting a type is not allowed. 
*/ public interface LogicalTypeAnnotationVisitor { - default Optional visit(StringLogicalTypeAnnotation logicalTypeAnnotation) { + default Optional visit(StringLogicalTypeAnnotation stringLogicalType) { return empty(); } - default Optional visit(MapLogicalTypeAnnotation logicalTypeAnnotation) { + default Optional visit(MapLogicalTypeAnnotation mapLogicalType) { return empty(); } - default Optional visit(ListLogicalTypeAnnotation logicalTypeAnnotation) { + default Optional visit(ListLogicalTypeAnnotation listLogicalType) { return empty(); } - default Optional visit(EnumLogicalTypeAnnotation logicalTypeAnnotation) { + default Optional visit(EnumLogicalTypeAnnotation enumLogicalType) { return empty(); } - default Optional visit(DecimalLogicalTypeAnnotation logicalTypeAnnotation) { + default Optional visit(DecimalLogicalTypeAnnotation decimalLogicalType) { return empty(); } - default Optional visit(DateLogicalTypeAnnotation logicalTypeAnnotation) { + default Optional visit(DateLogicalTypeAnnotation dateLogicalType) { return empty(); } - default Optional visit(TimeLogicalTypeAnnotation logicalTypeAnnotation) { + default Optional visit(TimeLogicalTypeAnnotation timeLogicalType) { return empty(); } - default Optional visit(TimestampLogicalTypeAnnotation logicalTypeAnnotation) { + default Optional visit(TimestampLogicalTypeAnnotation timestampLogicalType) { return empty(); } - default Optional visit(IntLogicalTypeAnnotation logicalTypeAnnotation) { + default Optional visit(IntLogicalTypeAnnotation intLogicalType) { return empty(); } - default Optional visit(JsonLogicalTypeAnnotation logicalTypeAnnotation) { + default Optional visit(JsonLogicalTypeAnnotation jsonLogicalType) { return empty(); } - default Optional visit(BsonLogicalTypeAnnotation logicalTypeAnnotation) { + default Optional visit(BsonLogicalTypeAnnotation bsonLogicalType) { return empty(); } - default Optional visit(IntervalLogicalTypeAnnotation logicalTypeAnnotation) { + default Optional 
visit(IntervalLogicalTypeAnnotation intervalLogicalType) { return empty(); } - default Optional visit(MapKeyValueTypeAnnotation logicalTypeAnnotation) { + default Optional visit(MapKeyValueTypeAnnotation mapKeyValueLogicalType) { return empty(); } } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/MessageType.java b/parquet-column/src/main/java/org/apache/parquet/schema/MessageType.java index d305eb88ee..83f98d7ecc 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/MessageType.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/MessageType.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -62,7 +62,7 @@ public void accept(TypeVisitor visitor) { public void writeToStringBuilder(StringBuilder sb, String indent) { sb.append("message ") .append(getName()) - .append(getOriginalType() == null ? "" : " (" + getOriginalType() +")") + .append(getLogicalTypeAnnotation() == null ? 
"" : " (" + getLogicalTypeAnnotation().toString() +")") .append(" {\n"); membersDisplayString(sb, " "); sb.append("}\n"); diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/OriginalType.java b/parquet-column/src/main/java/org/apache/parquet/schema/OriginalType.java index b00ae7e6ce..78421b33fb 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/OriginalType.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/OriginalType.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -21,46 +21,24 @@ public enum OriginalType { MAP, LIST, - UTF8(PrimitiveStringifier.UTF8_STRINGIFIER), + UTF8, MAP_KEY_VALUE, - ENUM(PrimitiveStringifier.UTF8_STRINGIFIER), - DECIMAL { - @Override - PrimitiveStringifier stringifier(PrimitiveType type) { - return PrimitiveStringifier.createDecimalStringifier(type.getDecimalMetadata().getScale()); - } - }, - DATE(PrimitiveStringifier.DATE_STRINGIFIER), - TIME_MILLIS(PrimitiveStringifier.TIME_STRINGIFIER), - TIME_MICROS(PrimitiveStringifier.TIME_STRINGIFIER), - TIMESTAMP_MILLIS(PrimitiveStringifier.TIMESTAMP_MILLIS_STRINGIFIER), - TIMESTAMP_MICROS(PrimitiveStringifier.TIMESTAMP_MICROS_STRINGIFIER), - UINT_8(PrimitiveStringifier.UNSIGNED_STRINGIFIER), - UINT_16(PrimitiveStringifier.UNSIGNED_STRINGIFIER), - UINT_32(PrimitiveStringifier.UNSIGNED_STRINGIFIER), - UINT_64(PrimitiveStringifier.UNSIGNED_STRINGIFIER), - 
INT_8(PrimitiveStringifier.DEFAULT_STRINGIFIER), - INT_16(PrimitiveStringifier.DEFAULT_STRINGIFIER), - INT_32(PrimitiveStringifier.DEFAULT_STRINGIFIER), - INT_64(PrimitiveStringifier.DEFAULT_STRINGIFIER), - JSON(PrimitiveStringifier.UTF8_STRINGIFIER), - BSON(PrimitiveStringifier.DEFAULT_STRINGIFIER), - INTERVAL(PrimitiveStringifier.INTERVAL_STRINGIFIER); - - private final PrimitiveStringifier stringifier; - - PrimitiveStringifier stringifier(PrimitiveType type) { - if (stringifier == null) { - throw new UnsupportedOperationException("Stringifier is not supported for the original type: " + this); - } - return stringifier; - } - - OriginalType() { - this(null); - } - - OriginalType(PrimitiveStringifier stringifier) { - this.stringifier = stringifier; - } + ENUM, + DECIMAL, + DATE, + TIME_MILLIS, + TIME_MICROS, + TIMESTAMP_MILLIS, + TIMESTAMP_MICROS, + UINT_8, + UINT_16, + UINT_32, + UINT_64, + INT_8, + INT_16, + INT_32, + INT_64, + JSON, + BSON, + INTERVAL } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java index 5e9adbcf7b..d343b0ea4c 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java @@ -238,8 +238,8 @@ int compare(ByteBuffer b1, ByteBuffer b2) { int p1 = b1.position(); int p2 = b2.position(); - boolean isNegative1 = l1 > 0 ? b1.get(p1) < 0 : false; - boolean isNegative2 = l2 > 0 ? b2.get(p2) < 0 : false; + boolean isNegative1 = l1 > 0 && b1.get(p1) < 0; + boolean isNegative2 = l2 > 0 && b2.get(p2) < 0; if (isNegative1 != isNegative2) { return isNegative1 ? 
-1 : 1; } @@ -259,7 +259,7 @@ int compare(ByteBuffer b1, ByteBuffer b2) { // The beginning of the longer buffer equals to the padding or the lengths are equal if (result == 0) { - result = compare(l1, b1, p1, b2, p2); + result = compare(Math.min(l1, l2), b1, p1, b2, p2); } return result; } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveStringifier.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveStringifier.java index c1a9b582fe..4705ad94eb 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveStringifier.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveStringifier.java @@ -22,14 +22,16 @@ import static java.util.concurrent.TimeUnit.MICROSECONDS; import static java.util.concurrent.TimeUnit.MILLISECONDS; import static java.util.concurrent.TimeUnit.MINUTES; +import static java.util.concurrent.TimeUnit.NANOSECONDS; import static java.util.concurrent.TimeUnit.SECONDS; import java.math.BigDecimal; import java.math.BigInteger; import java.nio.ByteBuffer; import java.nio.ByteOrder; -import java.text.SimpleDateFormat; -import java.util.TimeZone; +import java.time.Instant; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; import java.util.concurrent.TimeUnit; import javax.naming.OperationNotSupportedException; @@ -242,71 +244,120 @@ String stringifyNotNull(Binary value) { }; private static class DateStringifier extends PrimitiveStringifier { - private final SimpleDateFormat formatter; - private static final TimeZone UTC = TimeZone.getTimeZone("utc"); + private final DateTimeFormatter formatter; private DateStringifier(String name, String format) { super(name); - formatter = new SimpleDateFormat(format); - formatter.setTimeZone(UTC); + formatter = DateTimeFormatter.ofPattern(format).withZone(ZoneOffset.UTC); } @Override public String stringify(int value) { - return toFormattedString(toMillis(value)); + return toFormattedString(getInstant(value)); } 
@Override public String stringify(long value) { - return toFormattedString(toMillis(value)); + return toFormattedString(getInstant(value)); } - private String toFormattedString(long millis) { - return formatter.format(millis); + private String toFormattedString(Instant instant) { + return formatter.format(instant); } - long toMillis(int value) { + Instant getInstant(int value) { // throw the related unsupported exception super.stringify(value); - return 0; + return null; } - long toMillis(long value) { + Instant getInstant(long value) { // throw the related unsupported exception super.stringify(value); - return 0; + return null; } } static final PrimitiveStringifier DATE_STRINGIFIER = new DateStringifier("DATE_STRINGIFIER", "yyyy-MM-dd") { @Override - long toMillis(int value) { - return TimeUnit.DAYS.toMillis(value); + Instant getInstant(int value) { + return Instant.ofEpochMilli(TimeUnit.DAYS.toMillis(value)); }; }; static final PrimitiveStringifier TIMESTAMP_MILLIS_STRINGIFIER = new DateStringifier( "TIMESTAMP_MILLIS_STRINGIFIER", "yyyy-MM-dd'T'HH:mm:ss.SSS") { @Override - long toMillis(long value) { - return value; + Instant getInstant(long value) { + return Instant.ofEpochMilli(value); } }; static final PrimitiveStringifier TIMESTAMP_MICROS_STRINGIFIER = new DateStringifier( - "TIMESTAMP_MICROS_STRINGIFIER", "yyyy-MM-dd'T'HH:mm:ss.SSS") { + "TIMESTAMP_MICROS_STRINGIFIER", "yyyy-MM-dd'T'HH:mm:ss.SSSSSS") { @Override - public String stringify(long value) { - return super.stringify(value) + String.format("%03d", Math.abs(value % 1000)); + Instant getInstant(long value) { + return Instant.ofEpochSecond(MICROSECONDS.toSeconds(value), MICROSECONDS.toNanos(value % SECONDS.toMicros(1))); + } + }; + + static final PrimitiveStringifier TIMESTAMP_NANOS_STRINGIFIER = new DateStringifier( + "TIMESTAMP_NANOS_STRINGIFIER", "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS") { + @Override + Instant getInstant(long value) { + return Instant.ofEpochSecond(NANOSECONDS.toSeconds(value), 
NANOSECONDS.toNanos(value % SECONDS.toNanos(1))); } + }; + static final PrimitiveStringifier TIMESTAMP_MILLIS_UTC_STRINGIFIER = new DateStringifier( + "TIMESTAMP_MILLIS_UTC_STRINGIFIER", "yyyy-MM-dd'T'HH:mm:ss.SSSZ") { @Override - long toMillis(long value) { - return value / 1000; + Instant getInstant(long value) { + return Instant.ofEpochMilli(value); } }; - static final PrimitiveStringifier TIME_STRINGIFIER = new PrimitiveStringifier("TIME_STRINGIFIER") { + static final PrimitiveStringifier TIMESTAMP_MICROS_UTC_STRINGIFIER = new DateStringifier( + "TIMESTAMP_MICROS_UTC_STRINGIFIER", "yyyy-MM-dd'T'HH:mm:ss.SSSSSSZ") { + @Override + Instant getInstant(long value) { + return Instant.ofEpochSecond(MICROSECONDS.toSeconds(value), MICROSECONDS.toNanos(value % SECONDS.toMicros(1))); + } + }; + + static final PrimitiveStringifier TIMESTAMP_NANOS_UTC_STRINGIFIER = new DateStringifier( + "TIMESTAMP_NANOS_UTC_STRINGIFIER", "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSSZ") { + @Override + Instant getInstant(long value) { + return Instant.ofEpochSecond(NANOSECONDS.toSeconds(value), NANOSECONDS.toNanos(value % SECONDS.toNanos(1))); + } + }; + + private abstract static class TimeStringifier extends PrimitiveStringifier { + private final boolean withZone; + + TimeStringifier(String name, boolean withZone) { + super(name); + this.withZone = withZone; + } + + protected String toTimeString(long duration, TimeUnit unit) { + String additionalFormat = (unit == MILLISECONDS ? "3d" : unit == MICROSECONDS ? "6d" : "9d"); + String timeZone = withZone ? 
"+0000" : ""; + String format = "%02d:%02d:%02d.%0" + additionalFormat + timeZone; + return String.format(format, + unit.toHours(duration), + convert(duration, unit, MINUTES, HOURS), + convert(duration, unit, SECONDS, MINUTES), + convert(duration, unit, unit, SECONDS)); + } + + protected long convert(long duration, TimeUnit from, TimeUnit to, TimeUnit higher) { + return Math.abs(to.convert(duration, from) % to.convert(1, higher)); + } + } + + static final PrimitiveStringifier TIME_STRINGIFIER = new TimeStringifier("TIME_STRINGIFIER", false) { @Override public String stringify(int millis) { return toTimeString(millis, MILLISECONDS); @@ -316,18 +367,31 @@ public String stringify(int millis) { public String stringify(long micros) { return toTimeString(micros, MICROSECONDS); } + }; - private String toTimeString(long duration, TimeUnit unit) { - String format = "%02d:%02d:%02d.%0" + (unit == MILLISECONDS ? "3d" : "6d"); - return String.format(format, - unit.toHours(duration), - convert(duration, unit, MINUTES, HOURS), - convert(duration, unit, SECONDS, MINUTES), - convert(duration, unit, unit, SECONDS)); + static final PrimitiveStringifier TIME_NANOS_STRINGIFIER = new TimeStringifier("TIME_NANOS_STRINGIFIER", false) { + @Override + public String stringify(long nanos) { + return toTimeString(nanos, NANOSECONDS); } + }; - private long convert(long duration, TimeUnit from, TimeUnit to, TimeUnit higher) { - return Math.abs(to.convert(duration, from) % to.convert(1, higher)); + static final PrimitiveStringifier TIME_UTC_STRINGIFIER = new TimeStringifier("TIME_UTC_STRINGIFIER", true) { + @Override + public String stringify(int millis) { + return toTimeString(millis, MILLISECONDS); + } + + @Override + public String stringify(long micros) { + return toTimeString(micros, MICROSECONDS); + } + }; + + static final PrimitiveStringifier TIME_NANOS_UTC_STRINGIFIER = new TimeStringifier("TIME_NANOS_UTC_STRINGIFIER", true) { + @Override + public String stringify(long nanos) { + return 
toTimeString(nanos, NANOSECONDS); } }; diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java index 08adfbe996..6a7382eaba 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java @@ -21,6 +21,8 @@ import java.util.Arrays; import java.util.List; import java.util.Locale; +import java.util.Objects; +import java.util.Optional; import org.apache.parquet.Preconditions; import org.apache.parquet.ShouldNeverHappenException; @@ -31,6 +33,11 @@ import org.apache.parquet.io.api.RecordConsumer; import org.apache.parquet.schema.ColumnOrder.ColumnOrderName; +import static java.util.Optional.empty; +import static java.util.Optional.of; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MICROS; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MILLIS; + /** * Representation of a Primitive type @@ -85,23 +92,32 @@ public T convert(PrimitiveTypeNameConverter conve } @Override - PrimitiveComparator comparator(OriginalType logicalType) { + PrimitiveComparator comparator(LogicalTypeAnnotation logicalType) { if (logicalType == null) { return PrimitiveComparator.SIGNED_INT64_COMPARATOR; } - switch (logicalType) { - case UINT_64: - return PrimitiveComparator.UNSIGNED_INT64_COMPARATOR; - case INT_64: - case DECIMAL: - case TIME_MICROS: - case TIMESTAMP_MILLIS: - case TIMESTAMP_MICROS: - return PrimitiveComparator.SIGNED_INT64_COMPARATOR; - default: - throw new ShouldNeverHappenException( - "No comparator logic implemented for INT64 logical type: " + logicalType); - } + return logicalType.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public Optional visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) { + return intLogicalType.isSigned() ? 
+ of(PrimitiveComparator.SIGNED_INT64_COMPARATOR) : of(PrimitiveComparator.UNSIGNED_INT64_COMPARATOR); + } + + @Override + public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { + return of(PrimitiveComparator.SIGNED_INT64_COMPARATOR); + } + + @Override + public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { + return of(PrimitiveComparator.SIGNED_INT64_COMPARATOR); + } + + @Override + public Optional visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { + return of(PrimitiveComparator.SIGNED_INT64_COMPARATOR); + } + }).orElseThrow(() -> new ShouldNeverHappenException("No comparator logic implemented for INT64 logical type: " + logicalType)); } }, INT32("getInteger", Integer.TYPE) { @@ -128,26 +144,39 @@ public T convert(PrimitiveTypeNameConverter conve } @Override - PrimitiveComparator comparator(OriginalType logicalType) { + PrimitiveComparator comparator(LogicalTypeAnnotation logicalType) { if (logicalType == null) { return PrimitiveComparator.SIGNED_INT32_COMPARATOR; } - switch (logicalType) { - case UINT_8: - case UINT_16: - case UINT_32: - return PrimitiveComparator.UNSIGNED_INT32_COMPARATOR; - case INT_8: - case INT_16: - case INT_32: - case DECIMAL: - case DATE: - case TIME_MILLIS: - return PrimitiveComparator.SIGNED_INT32_COMPARATOR; - default: - throw new ShouldNeverHappenException( - "No comparator logic implemented for INT32 logical type: " + logicalType); - } + return logicalType.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public Optional visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) { + if (intLogicalType.getBitWidth() == 64) { + return empty(); + } + return intLogicalType.isSigned() ? 
+ of(PrimitiveComparator.SIGNED_INT32_COMPARATOR) : of(PrimitiveComparator.UNSIGNED_INT32_COMPARATOR); + } + + @Override + public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { + return of(PrimitiveComparator.SIGNED_INT32_COMPARATOR); + } + + @Override + public Optional visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { + return of(PrimitiveComparator.SIGNED_INT32_COMPARATOR); + } + + @Override + public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { + if (timeLogicalType.getUnit() == MILLIS) { + return of(PrimitiveComparator.SIGNED_INT32_COMPARATOR); + } + return empty(); + } + }).orElseThrow( + () -> new ShouldNeverHappenException("No comparator logic implemented for INT32 logical type: " + logicalType)); } }, BOOLEAN("getBoolean", Boolean.TYPE) { @@ -174,7 +203,7 @@ public T convert(PrimitiveTypeNameConverter conve } @Override - PrimitiveComparator comparator(OriginalType logicalType) { + PrimitiveComparator comparator(LogicalTypeAnnotation logicalType) { return PrimitiveComparator.BOOLEAN_COMPARATOR; } }, @@ -202,22 +231,36 @@ public T convert(PrimitiveTypeNameConverter conve } @Override - PrimitiveComparator comparator(OriginalType logicalType) { + PrimitiveComparator comparator(LogicalTypeAnnotation logicalType) { if (logicalType == null) { return PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR; } - switch (logicalType) { - case DECIMAL: - return PrimitiveComparator.BINARY_AS_SIGNED_INTEGER_COMPARATOR; - case UTF8: - case ENUM: - case JSON: - case BSON: - return PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR; - default: - throw new ShouldNeverHappenException( - "No comparator logic implemented for BINARY logical type: " + logicalType); - } + return logicalType.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { + 
return of(PrimitiveComparator.BINARY_AS_SIGNED_INTEGER_COMPARATOR); + } + + @Override + public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) { + return of(PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR); + } + + @Override + public Optional visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { + return of(PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR); + } + + @Override + public Optional visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) { + return of(PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR); + } + + @Override + public Optional visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) { + return of(PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR); + } + }).orElseThrow(() -> new ShouldNeverHappenException("No comparator logic implemented for BINARY logical type: " + logicalType)); } }, FLOAT("getFloat", Float.TYPE) { @@ -244,7 +287,7 @@ public T convert(PrimitiveTypeNameConverter conve } @Override - PrimitiveComparator comparator(OriginalType logicalType) { + PrimitiveComparator comparator(LogicalTypeAnnotation logicalType) { return PrimitiveComparator.FLOAT_COMPARATOR; } }, @@ -272,7 +315,7 @@ public T convert(PrimitiveTypeNameConverter conve } @Override - PrimitiveComparator comparator(OriginalType logicalType) { + PrimitiveComparator comparator(LogicalTypeAnnotation logicalType) { return PrimitiveComparator.DOUBLE_COMPARATOR; } }, @@ -298,7 +341,7 @@ public T convert(PrimitiveTypeNameConverter conve } @Override - PrimitiveComparator comparator(OriginalType logicalType) { + PrimitiveComparator comparator(LogicalTypeAnnotation logicalType) { return PrimitiveComparator.BINARY_AS_SIGNED_INTEGER_COMPARATOR; } }, @@ -326,19 +369,23 @@ public T convert(PrimitiveTypeNameConverter conve } @Override - PrimitiveComparator comparator(OriginalType logicalType) { + PrimitiveComparator comparator(LogicalTypeAnnotation 
logicalType) { if (logicalType == null) { return PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR; } - switch (logicalType) { - case DECIMAL: - return PrimitiveComparator.BINARY_AS_SIGNED_INTEGER_COMPARATOR; - case INTERVAL: - return PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR; - default: - throw new ShouldNeverHappenException( - "No comparator logic implemented for FIXED_LEN_BYTE_ARRAY logical type: " + logicalType); - } + + return logicalType.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { + return of(PrimitiveComparator.BINARY_AS_SIGNED_INTEGER_COMPARATOR); + } + + @Override + public Optional visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) { + return of(PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR); + } + }).orElseThrow(() -> new ShouldNeverHappenException( + "No comparator logic implemented for FIXED_LEN_BYTE_ARRAY logical type: " + logicalType)); } }; @@ -370,7 +417,7 @@ abstract public void addValueToPrimitiveConverter( abstract public T convert(PrimitiveTypeNameConverter converter) throws E; - abstract PrimitiveComparator comparator(OriginalType logicalType); + abstract PrimitiveComparator comparator(LogicalTypeAnnotation logicalType); } private final PrimitiveTypeName primitive; @@ -474,7 +521,7 @@ public PrimitiveType(Repetition repetition, PrimitiveTypeName primitive, super(name, repetition, logicalTypeAnnotation, id); this.primitive = primitive; this.length = length; - if (getOriginalType() == OriginalType.DECIMAL) { + if (logicalTypeAnnotation instanceof LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) { LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimal = (LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) logicalTypeAnnotation; this.decimalMeta = new DecimalMetadata(decimal.getPrecision(), decimal.getScale()); } else { @@ -482,7 
+529,7 @@ public PrimitiveType(Repetition repetition, PrimitiveTypeName primitive, } if (columnOrder == null) { - columnOrder = primitive == PrimitiveTypeName.INT96 || getOriginalType() == OriginalType.INTERVAL + columnOrder = primitive == PrimitiveTypeName.INT96 || logicalTypeAnnotation instanceof LogicalTypeAnnotation.IntervalLogicalTypeAnnotation ? ColumnOrder.undefined() : ColumnOrder.typeDefined(); } @@ -494,35 +541,9 @@ private ColumnOrder requireValidColumnOrder(ColumnOrder columnOrder) { Preconditions.checkArgument(columnOrder.getColumnOrderName() == ColumnOrderName.UNDEFINED, "The column order {} is not supported by INT96", columnOrder); } - if (getOriginalType() != null) { - // Explicitly listing all the logical types to avoid having unsupported column orders new types accidentally - switch (getOriginalType()) { - case INT_8: - case INT_16: - case INT_32: - case INT_64: - case UINT_8: - case UINT_16: - case UINT_32: - case UINT_64: - case UTF8: - case DECIMAL: - case DATE: - case TIME_MILLIS: - case TIME_MICROS: - case TIMESTAMP_MILLIS: - case TIMESTAMP_MICROS: - case ENUM: - case JSON: - case BSON: - // Currently any available column order is valid - break; - case INTERVAL: - default: - Preconditions.checkArgument(columnOrder.getColumnOrderName() == ColumnOrderName.UNDEFINED, - "The column order {} is not supported by {} ({})", columnOrder, primitive, getOriginalType()); - break; - } + if (getLogicalTypeAnnotation() != null) { + Preconditions.checkArgument(getLogicalTypeAnnotation().isValidColumnOrder(columnOrder), + "The column order {} is not supported by {} ({})", columnOrder, primitive, getLogicalTypeAnnotation()); } return columnOrder; } @@ -533,7 +554,7 @@ private ColumnOrder requireValidColumnOrder(ColumnOrder columnOrder) { */ @Override public PrimitiveType withId(int id) { - return new PrimitiveType(getRepetition(), primitive, length, getName(), getOriginalType(), decimalMeta, new ID(id), + return new PrimitiveType(getRepetition(), primitive, 
length, getName(), getLogicalTypeAnnotation(), new ID(id), columnOrder); } @@ -712,7 +733,7 @@ protected Type union(Type toMerge, boolean strict) { if (strict) { // Can't merge primitive fields of different type names or different original types if (!primitive.equals(toMerge.asPrimitiveType().getPrimitiveTypeName()) || - getOriginalType() != toMerge.getOriginalType()) { + !Objects.equals(getLogicalTypeAnnotation(), toMerge.getLogicalTypeAnnotation())) { reportSchemaMergeError(toMerge); } @@ -734,7 +755,7 @@ protected Type union(Type toMerge, boolean strict) { builder.length(length); } - return builder.as(getOriginalType()).named(getName()); + return builder.as(getLogicalTypeAnnotation()).named(getName()); } /** @@ -747,7 +768,7 @@ protected Type union(Type toMerge, boolean strict) { */ @SuppressWarnings("unchecked") public PrimitiveComparator comparator() { - return (PrimitiveComparator) getPrimitiveTypeName().comparator(getOriginalType()); + return (PrimitiveComparator) getPrimitiveTypeName().comparator(getLogicalTypeAnnotation()); } /** @@ -762,7 +783,7 @@ public ColumnOrder columnOrder() { */ @SuppressWarnings("unchecked") public PrimitiveStringifier stringifier() { - OriginalType originalType = getOriginalType(); - return originalType == null ? PrimitiveStringifier.DEFAULT_STRINGIFIER : originalType.stringifier(this); + LogicalTypeAnnotation logicalTypeAnnotation = getLogicalTypeAnnotation(); + return logicalTypeAnnotation == null ? 
PrimitiveStringifier.DEFAULT_STRINGIFIER : logicalTypeAnnotation.valueStringifier(this); } } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/Types.java b/parquet-column/src/main/java/org/apache/parquet/schema/Types.java index 165a5acea9..a1cd736580 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/Types.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/Types.java @@ -21,6 +21,7 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.Optional; import org.apache.parquet.Preconditions; import org.apache.parquet.schema.ColumnOrder.ColumnOrderName; @@ -441,16 +442,27 @@ protected PrimitiveType build(String name) { // validate type annotations and required metadata if (logicalTypeAnnotation != null) { - OriginalType originalType = logicalTypeAnnotation.toOriginalType(); - switch (originalType) { - case UTF8: - case JSON: - case BSON: - Preconditions.checkState( - primitiveType == PrimitiveTypeName.BINARY, - originalType.toString() + " can only annotate binary fields"); - break; - case DECIMAL: + logicalTypeAnnotation.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) { + checkBinaryPrimitiveType(stringLogicalType); + return Optional.of(true); + } + + @Override + public Optional visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) { + checkBinaryPrimitiveType(jsonLogicalType); + return Optional.of(true); + } + + @Override + public Optional visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) { + checkBinaryPrimitiveType(bsonLogicalType); + return Optional.of(true); + } + + @Override + public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { Preconditions.checkState( (primitiveType == PrimitiveTypeName.INT32) || (primitiveType == PrimitiveTypeName.INT64) || @@ -478,40 +490,89 @@ 
protected PrimitiveType build(String name) { "FIXED(" + length + ") cannot store " + meta.getPrecision() + " digits (max " + maxPrecision(length) + ")"); } - break; - case DATE: - case TIME_MILLIS: - case UINT_8: - case UINT_16: - case UINT_32: - case INT_8: - case INT_16: - case INT_32: - Preconditions.checkState(primitiveType == PrimitiveTypeName.INT32, - originalType.toString() + " can only annotate INT32"); - break; - case TIME_MICROS: - case TIMESTAMP_MILLIS: - case TIMESTAMP_MICROS: - case UINT_64: - case INT_64: - Preconditions.checkState(primitiveType == PrimitiveTypeName.INT64, - originalType.toString() + " can only annotate INT64"); - break; - case INTERVAL: + return Optional.of(true); + } + + @Override + public Optional visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { + checkInt32PrimitiveType(dateLogicalType); + return Optional.of(true); + } + + @Override + public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { + LogicalTypeAnnotation.TimeUnit unit = timeLogicalType.getUnit(); + switch (unit) { + case MILLIS: + checkInt32PrimitiveType(timeLogicalType); + break; + case MICROS: + case NANOS: + checkInt64PrimitiveType(timeLogicalType); + break; + default: + throw new RuntimeException("Invalid time unit: " + unit); + } + return Optional.of(true); + } + + @Override + public Optional visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) { + int bitWidth = intLogicalType.getBitWidth(); + switch (bitWidth) { + case 8: + case 16: + case 32: + checkInt32PrimitiveType(intLogicalType); + break; + case 64: + checkInt64PrimitiveType(intLogicalType); + break; + default: + throw new RuntimeException("Invalid bit width: " + bitWidth); + } + return Optional.of(true); + } + + @Override + public Optional visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { + checkInt64PrimitiveType(timestampLogicalType); + return Optional.of(true); + } + + @Override + public Optional 
visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) { Preconditions.checkState( (primitiveType == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) && (length == 12), "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)"); - break; - case ENUM: + return Optional.of(true); + } + + @Override + public Optional visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { Preconditions.checkState( primitiveType == PrimitiveTypeName.BINARY, "ENUM can only annotate binary fields"); - break; - default: - throw new IllegalStateException(originalType + " can not be applied to a primitive type"); - } + return Optional.of(true); + } + + private void checkBinaryPrimitiveType(LogicalTypeAnnotation logicalTypeAnnotation) { + Preconditions.checkState( + primitiveType == PrimitiveTypeName.BINARY, + logicalTypeAnnotation.toString() + " can only annotate binary fields"); + } + + private void checkInt32PrimitiveType(LogicalTypeAnnotation logicalTypeAnnotation) { + Preconditions.checkState(primitiveType == PrimitiveTypeName.INT32, + logicalTypeAnnotation.toString() + " can only annotate INT32"); + } + + private void checkInt64PrimitiveType(LogicalTypeAnnotation logicalTypeAnnotation) { + Preconditions.checkState(primitiveType == PrimitiveTypeName.INT64, + logicalTypeAnnotation.toString() + " can only annotate INT64"); + } + }).orElseThrow(() -> new IllegalStateException(logicalTypeAnnotation + " can not be applied to a primitive type")); } if (newLogicalTypeSet) { @@ -531,7 +592,7 @@ private static long maxPrecision(int numBytes) { protected DecimalMetadata decimalMetadata() { DecimalMetadata meta = null; - if (OriginalType.DECIMAL == getOriginalType()) { + if (logicalTypeAnnotation instanceof LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) { LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalType = (LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) logicalTypeAnnotation; if (newLogicalTypeSet) { if (scaleAlreadySet) { diff --git 
a/parquet-column/src/test/java/org/apache/parquet/column/impl/TestColumnReaderImpl.java b/parquet-column/src/test/java/org/apache/parquet/column/impl/TestColumnReaderImpl.java index d2d78c43d1..35fddaf0b0 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/impl/TestColumnReaderImpl.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/impl/TestColumnReaderImpl.java @@ -65,10 +65,10 @@ public void test() throws Exception { for (int i = 0; i < rows; i++) { columnWriterV2.write(Binary.fromString("bar" + i % 10), 0, 0); if ((i + 1) % 1000 == 0) { - columnWriterV2.writePage(i); + columnWriterV2.writePage(); } } - columnWriterV2.writePage(rows); + columnWriterV2.writePage(); columnWriterV2.finalizeColumnChunk(); List pages = pageWriter.getPages(); int valueCount = 0; @@ -103,10 +103,10 @@ public void testOptional() throws Exception { for (int i = 0; i < rows; i++) { columnWriterV2.writeNull(0, 0); if ((i + 1) % 1000 == 0) { - columnWriterV2.writePage(i); + columnWriterV2.writePage(); } } - columnWriterV2.writePage(rows); + columnWriterV2.writePage(); columnWriterV2.finalizeColumnChunk(); List pages = pageWriter.getPages(); int valueCount = 0; diff --git a/parquet-column/src/test/java/org/apache/parquet/column/mem/TestMemColumn.java b/parquet-column/src/test/java/org/apache/parquet/column/mem/TestMemColumn.java index c855339c59..e5db38c945 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/mem/TestMemColumn.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/mem/TestMemColumn.java @@ -20,12 +20,10 @@ import static org.junit.Assert.assertEquals; -import org.apache.parquet.column.ParquetProperties; -import org.junit.Test; - import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.ColumnReader; import org.apache.parquet.column.ColumnWriter; +import org.apache.parquet.column.ParquetProperties; import org.apache.parquet.column.impl.ColumnReadStoreImpl; import 
org.apache.parquet.column.impl.ColumnWriteStoreV1; import org.apache.parquet.column.page.mem.MemPageStore; @@ -33,6 +31,7 @@ import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.MessageTypeParser; +import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -47,6 +46,7 @@ public void testMemColumn() throws Exception { ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore); ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path); columnWriter.write(42l, 0, 0); + memColumnsStore.endRecord(); memColumnsStore.flush(); ColumnReader columnReader = getColumnReader(memPageStore, path, schema); @@ -85,6 +85,7 @@ public void testMemColumnBinary() throws Exception { ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path); columnWriter.write(Binary.fromString("42"), 0, 0); + memColumnsStore.endRecord(); memColumnsStore.flush(); ColumnReader columnReader = getColumnReader(memPageStore, path, mt); @@ -108,6 +109,7 @@ public void testMemColumnSeveralPages() throws Exception { ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path); for (int i = 0; i < 2000; i++) { columnWriter.write(42l, 0, 0); + memColumnsStore.endRecord(); } memColumnsStore.flush(); @@ -136,12 +138,16 @@ public void testMemColumnSeveralPagesRepeated() throws Exception { int r = rs[i % rs.length]; int d = ds[i % ds.length]; LOG.debug("write i: {}", i); + if (i != 0 && r == 0) { + memColumnsStore.endRecord(); + } if (d == 2) { columnWriter.write((long)i, r, d); } else { columnWriter.writeNull(r, d); } } + memColumnsStore.endRecord(); memColumnsStore.flush(); ColumnReader columnReader = getColumnReader(memPageStore, path, mt); diff --git a/parquet-column/src/test/java/org/apache/parquet/column/page/mem/MemPageWriter.java b/parquet-column/src/test/java/org/apache/parquet/column/page/mem/MemPageWriter.java index be3a0f9cb4..706b00110d 100644 --- 
a/parquet-column/src/test/java/org/apache/parquet/column/page/mem/MemPageWriter.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/page/mem/MemPageWriter.java @@ -56,6 +56,12 @@ public void writePage(BytesInput bytesInput, int valueCount, Statistics statisti LOG.debug("page written for {} bytes and {} records", bytesInput.size(), valueCount); } + @Override + public void writePage(BytesInput bytesInput, int valueCount, int rowCount, Statistics statistics, + Encoding rlEncoding, Encoding dlEncoding, Encoding valuesEncoding) throws IOException { + writePage(bytesInput, valueCount, statistics, rlEncoding, dlEncoding, valuesEncoding); + } + @Override public void writePageV2(int rowCount, int nullCount, int valueCount, BytesInput repetitionLevels, BytesInput definitionLevels, diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/bitpacking/TestBitPackingColumn.java b/parquet-column/src/test/java/org/apache/parquet/column/values/bitpacking/TestBitPackingColumn.java index 867af2876d..3ca3d0898d 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/values/bitpacking/TestBitPackingColumn.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/bitpacking/TestBitPackingColumn.java @@ -183,6 +183,22 @@ private void validateEncodeDecode(int bitLength, int[] vals, String expected) th } LOG.debug("result: {}", TestBitPacking.toString(result)); assertArrayEquals(type + " result: " + TestBitPacking.toString(result), vals, result); + + // Test skipping + r.initFromPage(vals.length, ByteBufferInputStream.wrap(ByteBuffer.wrap(bytes))); + for (int i = 0; i < vals.length; i += 2) { + assertEquals(vals[i], r.readInteger()); + r.skip(); + } + + // Test n-skipping + r.initFromPage(vals.length, ByteBufferInputStream.wrap(ByteBuffer.wrap(bytes))); + int skipCount; + for (int i = 0; i < vals.length; i += skipCount + 1) { + skipCount = (vals.length - i) / 2; + assertEquals(vals[i], r.readInteger()); + r.skip(skipCount); + } } } 
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java b/parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java index 542b9cd25a..0f85195706 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java @@ -16,46 +16,39 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.parquet.column.values.bloomfilter; - - import java.io.File; - import java.io.FileInputStream; - import java.io.FileOutputStream; - import java.io.IOException; - import java.nio.ByteBuffer; - import java.nio.ByteOrder; - import java.util.ArrayList; - import java.util.List; - import java.util.Random; - - import jdk.nashorn.internal.ir.Block; - import org.apache.parquet.column.values.RandomStr; - import org.apache.parquet.io.api.Binary; - import org.junit.Rule; - import org.junit.Test; - import org.junit.rules.TemporaryFolder; - - import static org.junit.Assert.assertEquals; - import static org.junit.Assert.assertTrue; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; +import org.apache.parquet.column.values.RandomStr; +import org.apache.parquet.io.api.Binary; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; public class TestBlockSplitBloomFilter { - @Test public void testConstructor () throws IOException { BloomFilter bloomFilter1 = new BlockSplitBloomFilter(0); assertEquals(bloomFilter1.getBitsetSize(), 
BlockSplitBloomFilter.MINIMUM_BLOOM_FILTER_BYTES); - BloomFilter bloomFilter2 = new BlockSplitBloomFilter(256 * 1024 * 1024); assertEquals(bloomFilter2.getBitsetSize(), BlockSplitBloomFilter.MAXIMUM_BLOOM_FILTER_BYTES); - BloomFilter bloomFilter3 = new BlockSplitBloomFilter(1000); assertEquals(bloomFilter3.getBitsetSize(), 1024); } @Rule public final TemporaryFolder temp = new TemporaryFolder(); + /* * This test is used to test basic operations including inserting, finding and * serializing and de-serializing. @@ -73,11 +66,9 @@ public void testBasic () throws IOException { FileOutputStream fileOutputStream = new FileOutputStream(testFile); bloomFilter.writeTo(fileOutputStream); fileOutputStream.close(); - FileInputStream fileInputStream = new FileInputStream(testFile); byte[] value = new byte[4]; - fileInputStream.read(value); int length = ByteBuffer.wrap(value).order(ByteOrder.LITTLE_ENDIAN).getInt(); assertEquals(length, 1024); @@ -93,7 +84,6 @@ public void testBasic () throws IOException { byte[] bitset = new byte[length]; fileInputStream.read(bitset); bloomFilter = new BlockSplitBloomFilter(bitset); - for(int i = 0; i < testStrings.length; i++) { assertTrue(bloomFilter.find(bloomFilter.hash(Binary.fromString(testStrings[i])))); } @@ -122,7 +112,7 @@ public void testFPP() throws IOException { exist ++; } } - + // The exist should be probably less than 1000 according FPP 0.01. 
assertTrue(exist < totalCount * FPP); } diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/delta/DeltaBinaryPackingValuesWriterForIntegerTest.java b/parquet-column/src/test/java/org/apache/parquet/column/values/delta/DeltaBinaryPackingValuesWriterForIntegerTest.java index df99e3c740..c69e0ff9c1 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/values/delta/DeltaBinaryPackingValuesWriterForIntegerTest.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/delta/DeltaBinaryPackingValuesWriterForIntegerTest.java @@ -212,6 +212,23 @@ public void shouldSkip() throws IOException { } } + @Test + public void shouldSkipN() throws IOException { + int[] data = new int[5 * blockSize + 1]; + for (int i = 0; i < data.length; i++) { + data[i] = i * 32; + } + writeData(data); + reader = new DeltaBinaryPackingValuesReader(); + reader.initFromPage(100, writer.getBytes().toInputStream()); + int skipCount; + for (int i = 0; i < data.length; i += skipCount + 1) { + skipCount = (data.length - i) / 2; + assertEquals(i * 32, reader.readInteger()); + reader.skip(skipCount); + } + } + @Test public void shouldReset() throws IOException { shouldReadWriteWhenDataIsNotAlignedWithBlock(); diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/delta/DeltaBinaryPackingValuesWriterForLongTest.java b/parquet-column/src/test/java/org/apache/parquet/column/values/delta/DeltaBinaryPackingValuesWriterForLongTest.java index 65ac819e8c..ca12bbdb82 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/values/delta/DeltaBinaryPackingValuesWriterForLongTest.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/delta/DeltaBinaryPackingValuesWriterForLongTest.java @@ -211,6 +211,23 @@ public void shouldSkip() throws IOException { } } + @Test + public void shouldSkipN() throws IOException { + long[] data = new long[5 * blockSize + 1]; + for (int i = 0; i < data.length; i++) { + data[i] = i * 32; + } 
+ writeData(data); + reader = new DeltaBinaryPackingValuesReader(); + reader.initFromPage(100, writer.getBytes().toInputStream()); + int skipCount; + for (int i = 0; i < data.length; i += skipCount + 1) { + skipCount = (data.length - i) / 2; + assertEquals(i * 32, reader.readLong()); + reader.skip(skipCount); + } + } + @Test public void shouldReset() throws IOException { shouldReadWriteWhenDataIsNotAlignedWithBlock(); diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/deltalengthbytearray/TestDeltaLengthByteArray.java b/parquet-column/src/test/java/org/apache/parquet/column/values/deltalengthbytearray/TestDeltaLengthByteArray.java index d214a88980..6c974307b7 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/values/deltalengthbytearray/TestDeltaLengthByteArray.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/deltalengthbytearray/TestDeltaLengthByteArray.java @@ -64,6 +64,30 @@ public void testRandomStrings() throws IOException { } } + @Test + public void testSkipWithRandomStrings() throws IOException { + DeltaLengthByteArrayValuesWriter writer = getDeltaLengthByteArrayValuesWriter(); + DeltaLengthByteArrayValuesReader reader = new DeltaLengthByteArrayValuesReader(); + + String[] values = Utils.getRandomStringSamples(1000, 32); + Utils.writeData(writer, values); + + reader.initFromPage(values.length, writer.getBytes().toInputStream()); + for (int i = 0; i < values.length; i += 2) { + Assert.assertEquals(Binary.fromString(values[i]), reader.readBytes()); + reader.skip(); + } + + reader = new DeltaLengthByteArrayValuesReader(); + reader.initFromPage(values.length, writer.getBytes().toInputStream()); + int skipCount; + for (int i = 0; i < values.length; i += skipCount + 1) { + skipCount = (values.length - i) / 2; + Assert.assertEquals(Binary.fromString(values[i]), reader.readBytes()); + reader.skip(skipCount); + } + } + @Test public void testLengths() throws IOException { DeltaLengthByteArrayValuesWriter 
writer = getDeltaLengthByteArrayValuesWriter(); diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/deltastrings/TestDeltaByteArray.java b/parquet-column/src/test/java/org/apache/parquet/column/values/deltastrings/TestDeltaByteArray.java index c13a3a2b87..a5a22a8dbf 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/values/deltastrings/TestDeltaByteArray.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/deltastrings/TestDeltaByteArray.java @@ -58,6 +58,13 @@ public void testRandomStringsWithSkip() throws Exception { assertReadWriteWithSkip(writer, reader, randvalues); } + @Test + public void testRandomStringsWithSkipN() throws Exception { + DeltaByteArrayWriter writer = new DeltaByteArrayWriter(64 * 1024, 64 * 1024, new DirectByteBufferAllocator()); + DeltaByteArrayReader reader = new DeltaByteArrayReader(); + assertReadWriteWithSkipN(writer, reader, randvalues); + } + @Test public void testLengths() throws IOException { DeltaByteArrayWriter writer = new DeltaByteArrayWriter(64 * 1024, 64 * 1024, new DirectByteBufferAllocator()); @@ -99,6 +106,18 @@ private void assertReadWriteWithSkip(DeltaByteArrayWriter writer, DeltaByteArray } } + private void assertReadWriteWithSkipN(DeltaByteArrayWriter writer, DeltaByteArrayReader reader, String[] vals) throws Exception { + Utils.writeData(writer, vals); + + reader.initFromPage(vals.length, writer.getBytes().toInputStream()); + int skipCount; + for (int i = 0; i < vals.length; i += skipCount + 1) { + skipCount = (vals.length - i) / 2; + Assert.assertEquals(Binary.fromString(vals[i]), reader.readBytes()); + reader.skip(skipCount); + } + } + @Test public void testWriterReset() throws Exception { DeltaByteArrayWriter writer = new DeltaByteArrayWriter(64 * 1024, 64 * 1024, new DirectByteBufferAllocator()); diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java 
b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java index ef2b7215dd..ba3f9034ad 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java @@ -99,6 +99,47 @@ public void testBinaryDictionary() throws IOException { checkDistinct(COUNT, bytes3, cr2, "c"); } + @Test + public void testSkipInBinaryDictionary() throws Exception { + ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(1000, 10000); + writeRepeated(100, cw, "a"); + writeDistinct(100, cw, "b"); + assertEquals(PLAIN_DICTIONARY, cw.getEncoding()); + + // Test skip and skip-n with dictionary encoding + ByteBufferInputStream stream = cw.getBytes().toInputStream(); + DictionaryValuesReader cr = initDicReader(cw, BINARY); + cr.initFromPage(200, stream); + for (int i = 0; i < 100; i += 2) { + assertEquals(Binary.fromString("a" + i % 10), cr.readBytes()); + cr.skip(); + } + int skipCount; + for (int i = 0; i < 100; i += skipCount + 1) { + skipCount = (100 - i) / 2; + assertEquals(Binary.fromString("b" + i), cr.readBytes()); + cr.skip(skipCount); + } + + // Ensure fallback + writeDistinct(1000, cw, "c"); + assertEquals(PLAIN, cw.getEncoding()); + + // Test skip and skip-n with plain encoding (after fallback) + ValuesReader plainReader = new BinaryPlainValuesReader(); + plainReader.initFromPage(1200, cw.getBytes().toInputStream()); + plainReader.skip(200); + for (int i = 0; i < 100; i += 2) { + assertEquals("c" + i, plainReader.readBytes().toStringUsingUTF8()); + plainReader.skip(); + } + for (int i = 100; i < 1000; i += skipCount + 1) { + skipCount = (1000 - i) / 2; + assertEquals(Binary.fromString("c" + i), plainReader.readBytes()); + plainReader.skip(skipCount); + } + } + @Test public void testBinaryDictionaryFallBack() throws IOException { int slabSize = 100; @@ -234,6 +275,22 @@ private void 
roundTripLong(FallbackValuesWriter unable to increment + assertEquals( + Binary.fromString( + UTF8_1BYTE_MAX_CHAR + + UTF8_2BYTES_MAX_CHAR), + truncator.truncateMin(Binary.fromString( + UTF8_1BYTE_MAX_CHAR + + UTF8_2BYTES_MAX_CHAR + + UTF8_3BYTES_MAX_CHAR + + UTF8_4BYTES_MAX_CHAR), + 5)); + assertEquals( + Binary.fromString( + UTF8_1BYTE_MAX_CHAR + + UTF8_2BYTES_MAX_CHAR + + UTF8_3BYTES_MAX_CHAR + + UTF8_4BYTES_MAX_CHAR), + truncator.truncateMax(Binary.fromString( + UTF8_1BYTE_MAX_CHAR + + UTF8_2BYTES_MAX_CHAR + + UTF8_3BYTES_MAX_CHAR + + UTF8_4BYTES_MAX_CHAR), + 5)); + + // Truncate highest UTF-8 values at the end -> increment the first possible character + assertEquals( + Binary.fromString( + UTF8_1BYTE_MAX_CHAR + + UTF8_2BYTES_MAX_CHAR + + "b" + + UTF8_3BYTES_MAX_CHAR), + truncator.truncateMax(Binary.fromString( + UTF8_1BYTE_MAX_CHAR + + UTF8_2BYTES_MAX_CHAR + + "a" + + UTF8_3BYTES_MAX_CHAR + + UTF8_4BYTES_MAX_CHAR), + 10)); + + // Truncate invalid UTF-8 values -> truncate without validity check + assertEquals(binary(0xFF, 0xFE, 0xFD), truncator.truncateMin(binary(0xFF, 0xFE, 0xFD, 0xFC, 0xFB, 0xFA), 3)); + assertEquals(binary(0xFF, 0xFE, 0xFE), truncator.truncateMax(binary(0xFF, 0xFE, 0xFD, 0xFC, 0xFB, 0xFA), 3)); + assertEquals(binary(0xFF, 0xFE, 0xFE, 0x00, 0x00), truncator.truncateMax(binary(0xFF, 0xFE, 0xFD, 0xFF, 0xFF, 0xFF), 5)); + } + + @Test + public void testContractStringTypes() { + testTruncator(Types.required(BINARY).named("test_binary"), true); + testTruncator(Types.required(BINARY).as(UTF8).named("test_utf8"), true); + testTruncator(Types.required(BINARY).as(ENUM).named("test_enum"), true); + testTruncator(Types.required(BINARY).as(JSON).named("test_json"), true); + testTruncator(Types.required(BINARY).as(BSON).named("test_bson"), true); + testTruncator(Types.required(FIXED_LEN_BYTE_ARRAY).length(5).named("test_fixed"), true); + } + + private void testTruncator(PrimitiveType type, boolean strict) { + BinaryTruncator truncator = 
BinaryTruncator.getTruncator(type); + Comparator comparator = type.comparator(); + + checkContract(truncator, comparator, Binary.fromString("aaaaaaaaaa"), strict, strict); + checkContract(truncator, comparator, Binary.fromString("árvíztűrő tükörfúrógép"), strict, strict); + checkContract(truncator, comparator, Binary.fromString("aaaaaaaaaa" + UTF8_3BYTES_MAX_CHAR), strict, strict); + checkContract(truncator, comparator, Binary.fromString("a" + UTF8_3BYTES_MAX_CHAR + UTF8_1BYTE_MAX_CHAR), strict, + strict); + + checkContract(truncator, comparator, + Binary.fromConstantByteArray(new byte[] { (byte) 0xFE, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, (byte) 0xFF }), strict, + strict); + + // Edge case: zero length -> unable to truncate + checkContract(truncator, comparator, Binary.fromString(""), false, false); + // Edge case: containing only UTF-8 max characters -> unable to truncate for max + checkContract(truncator, comparator, Binary.fromString( + UTF8_1BYTE_MAX_CHAR + + UTF8_4BYTES_MAX_CHAR + + UTF8_3BYTES_MAX_CHAR + + UTF8_4BYTES_MAX_CHAR + + UTF8_2BYTES_MAX_CHAR + + UTF8_3BYTES_MAX_CHAR + + UTF8_3BYTES_MAX_CHAR + + UTF8_1BYTE_MAX_CHAR + + UTF8_2BYTES_MAX_CHAR + + UTF8_3BYTES_MAX_CHAR + + UTF8_4BYTES_MAX_CHAR), + strict, false); + // Edge case: non-UTF-8; max bytes -> unable to truncate for max + checkContract( + truncator, comparator, + binary(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF), + strict, false); + } + + // Checks the contract of truncator + // strict means actual truncation is required and the truncated value is a valid UTF-8 string + private void checkContract(BinaryTruncator truncator, Comparator comparator, Binary value, boolean strictMin, + boolean strictMax) { + int length = value.length(); + + // Edge cases: returning the original value if no truncation is required + assertSame(value, truncator.truncateMin(value, length)); + assertSame(value, truncator.truncateMax(value, length)); + assertSame(value, truncator.truncateMin(value, random(length + 
1, length * 2 + 1))); + assertSame(value, truncator.truncateMax(value, random(length + 1, length * 2 + 1))); + + if (length > 1) { + checkMinContract(truncator, comparator, value, length - 1, strictMin); + checkMaxContract(truncator, comparator, value, length - 1, strictMax); + checkMinContract(truncator, comparator, value, random(1, length - 1), strictMin); + checkMaxContract(truncator, comparator, value, random(1, length - 1), strictMax); + } + + // Edge case: possible to truncate min value to 0 length if original value is not empty + checkMinContract(truncator, comparator, value, 0, strictMin); + // Edge case: impossible to truncate max value to 0 length -> returning the original value + assertSame(value, truncator.truncateMax(value, 0)); + } + + private void checkMinContract(BinaryTruncator truncator, Comparator comparator, Binary value, int length, + boolean strict) { + Binary truncated = truncator.truncateMin(value, length); + LOG.debug("\"{}\" --truncMin({})--> \"{}\" [{}]", value.toStringUsingUTF8(), length, truncated.toStringUsingUTF8(), + HEXA_STRINGIFIER.stringify(truncated)); + assertTrue("truncatedMin(value) should be <= than value", comparator.compare(truncated, value) <= 0); + assertFalse("length of truncateMin(value) should not be > than the length of value", + truncated.length() > value.length()); + if (isValidUtf8(value)) { + checkValidUtf8(truncated); + } + if (strict) { + assertTrue("length of truncateMin(value) ahould be < than the length of value", + truncated.length() < value.length()); + } + } + + private void checkMaxContract(BinaryTruncator truncator, Comparator comparator, Binary value, int length, + boolean strict) { + Binary truncated = truncator.truncateMax(value, length); + LOG.debug("\"{}\" --truncMax({})--> \"{}\" [{}]", value.toStringUsingUTF8(), length, truncated.toStringUsingUTF8(), + HEXA_STRINGIFIER.stringify(truncated)); + assertTrue("truncatedMax(value) should be >= than value", comparator.compare(truncated, value) >= 0); + 
assertFalse("length of truncateMax(value) should not be > than the length of value", + truncated.length() > value.length()); + if (isValidUtf8(value)) { + checkValidUtf8(truncated); + } + if (strict) { + assertTrue("length of truncateMax(value) ahould be < than the length of value", + truncated.length() < value.length()); + } + } + + private static boolean isValidUtf8(Binary binary) { + try { + UTF8_DECODER.decode(binary.toByteBuffer()); + return true; + } catch (CharacterCodingException e) { + return false; + } + } + + private static void checkValidUtf8(Binary binary) { + try { + UTF8_DECODER.decode(binary.toByteBuffer()); + } catch (CharacterCodingException e) { + throw new AssertionError("Truncated value should be a valid UTF-8 string", e); + } + } + + private static int random(int min, int max) { + return RANDOM.nextInt(max - min + 1) + min; + } + + private static Binary binary(int... unsignedBytes) { + byte[] byteArray = new byte[unsignedBytes.length]; + for (int i = 0, n = byteArray.length; i < n; ++i) { + int b = unsignedBytes[i]; + assert (0xFFFFFF00 & b) == 0; + byteArray[i] = (byte) b; + } + return Binary.fromConstantByteArray(byteArray); + } + +} diff --git a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java new file mode 100644 index 0000000000..3d2a924217 --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java @@ -0,0 +1,487 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.internal.column.columnindex; + +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.Iterator; +import java.util.PrimitiveIterator; +import java.util.Random; +import java.util.function.Function; +import java.util.stream.IntStream; + +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder.ColumnIndexBase; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Types; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.ints.IntList; + +/** + * Tests the operator implementations in {@link BoundaryOrder}. 
+ */ +public class TestBoundaryOrder { + private static class SpyValueComparatorBuilder extends ColumnIndexBase { + class SpyValueComparator extends ValueComparator { + private final ColumnIndexBase.ValueComparator delegate; + private int compareCount; + + SpyValueComparator(ColumnIndexBase.ValueComparator delegate) { + this.delegate = delegate; + } + + int getCompareCount() { + return compareCount; + } + + @Override + int arrayLength() { + return delegate.arrayLength(); + } + + @Override + int translate(int arrayIndex) { + return delegate.translate(arrayIndex); + } + + @Override + int compareValueToMin(int arrayIndex) { + ++compareCount; + return delegate.compareValueToMin(arrayIndex); + } + + @Override + int compareValueToMax(int arrayIndex) { + ++compareCount; + return delegate.compareValueToMax(arrayIndex); + } + } + + private SpyValueComparatorBuilder() { + super(TYPE); + } + + SpyValueComparator build(ColumnIndexBase.ValueComparator comparator) { + return new SpyValueComparator(comparator); + } + + @Override + ByteBuffer getMinValueAsBytes(int arrayIndex) { + throw new Error("Shall never be invoked"); + } + + @Override + ByteBuffer getMaxValueAsBytes(int arrayIndex) { + throw new Error("Shall never be invoked"); + } + + @Override + String getMinValueAsString(int arrayIndex) { + throw new Error("Shall never be invoked"); + } + + @Override + String getMaxValueAsString(int arrayIndex) { + throw new Error("Shall never be invoked"); + } + + @Override + > org.apache.parquet.filter2.predicate.Statistics createStats(int arrayIndex) { + throw new Error("Shall never be invoked"); + } + + @Override + ColumnIndexBase.ValueComparator createValueComparator(Object value) { + throw new Error("Shall never be invoked"); + } + } + + private static class ExecStats { + private long linearTime; + private long binaryTime; + private int linearCompareCount; + private int binaryCompareCount; + private int execCount; + + IntList measureLinear(Function.ValueComparator, 
PrimitiveIterator.OfInt> op, + ColumnIndexBase.ValueComparator comparator) { + IntList list = new IntArrayList(comparator.arrayLength()); + SpyValueComparatorBuilder.SpyValueComparator spyComparator = SPY_COMPARATOR_BUILDER.build(comparator); + long start = System.nanoTime(); + op.apply(spyComparator).forEachRemaining((int value) -> list.add(value)); + linearTime = System.nanoTime() - start; + linearCompareCount += spyComparator.getCompareCount(); + return list; + } + + IntList measureBinary(Function.ValueComparator, PrimitiveIterator.OfInt> op, + ColumnIndexBase.ValueComparator comparator) { + IntList list = new IntArrayList(comparator.arrayLength()); + SpyValueComparatorBuilder.SpyValueComparator spyComparator = SPY_COMPARATOR_BUILDER.build(comparator); + long start = System.nanoTime(); + op.apply(spyComparator).forEachRemaining((int value) -> list.add(value)); + binaryTime = System.nanoTime() - start; + binaryCompareCount += spyComparator.getCompareCount(); + return list; + } + + void add(ExecStats stats) { + linearTime += stats.linearTime; + linearCompareCount += stats.linearCompareCount; + binaryTime += stats.binaryTime; + binaryCompareCount += stats.binaryCompareCount; + ++execCount; + } + + @Override + public String toString() { + double linearMs = linearTime / 1_000_000.0; + double binaryMs = binaryTime / 1_000_000.0; + return String.format( + "Linear search: %.2fms (avg: %.6fms); number of compares: %d (avg: %d) [100.00%%]%n" + + "Binary search: %.2fms (avg: %.6fms); number of compares: %d (avg: %d) [%.2f%%]", + linearMs, linearMs / execCount, linearCompareCount, linearCompareCount / execCount, + binaryMs, binaryMs / execCount, binaryCompareCount, binaryCompareCount / execCount, + 100.0 * binaryCompareCount / linearCompareCount); + } + } + + private static final Logger LOGGER = LoggerFactory.getLogger(TestBoundaryOrder.class); + private static final PrimitiveType TYPE = Types.required(PrimitiveTypeName.INT32).named("test_int32"); + private static final int 
FROM = -15; + private static final int TO = 15; + private static final ColumnIndexBase ASCENDING; + private static final ColumnIndexBase DESCENDING; + private static final int SINGLE_FROM = -1; + private static final int SINGLE_TO = 1; + private static final ColumnIndexBase SINGLE; + private static final Random RANDOM = new Random(42); + private static final int RAND_FROM = -2000; + private static final int RAND_TO = 2000; + private static final int RAND_COUNT = 2000; + private static final ColumnIndexBase RAND_ASCENDING; + private static final ColumnIndexBase RAND_DESCENDING; + private static final SpyValueComparatorBuilder SPY_COMPARATOR_BUILDER = new SpyValueComparatorBuilder(); + static { + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(TYPE, Integer.MAX_VALUE); + builder.add(stats(FROM, -12)); + builder.add(stats(-10, -8)); + builder.add(stats(-8, -4)); + builder.add(stats(-6, -4)); + builder.add(stats(-6, -3)); + builder.add(stats(-6, -3)); + builder.add(stats(-6, -3)); + builder.add(stats(0, 3)); + builder.add(stats(3, 5)); + builder.add(stats(3, 5)); + builder.add(stats(5, 8)); + builder.add(stats(10, TO)); + ASCENDING = (ColumnIndexBase) builder.build(); + + builder = ColumnIndexBuilder.getBuilder(TYPE, Integer.MAX_VALUE); + builder.add(stats(10, TO)); + builder.add(stats(5, 8)); + builder.add(stats(3, 5)); + builder.add(stats(3, 5)); + builder.add(stats(0, 3)); + builder.add(stats(-6, -3)); + builder.add(stats(-6, -3)); + builder.add(stats(-6, -3)); + builder.add(stats(-6, -4)); + builder.add(stats(-8, -4)); + builder.add(stats(-10, -8)); + builder.add(stats(FROM, -12)); + DESCENDING = (ColumnIndexBase) builder.build(); + + builder = ColumnIndexBuilder.getBuilder(TYPE, Integer.MAX_VALUE); + builder.add(stats(SINGLE_FROM, SINGLE_TO)); + SINGLE = (ColumnIndexBase) builder.build(); + + builder = ColumnIndexBuilder.getBuilder(TYPE, Integer.MAX_VALUE); + for (PrimitiveIterator.OfInt it = IntStream.generate(() -> RANDOM.nextInt(RAND_TO - RAND_FROM + 
1) + RAND_FROM) + .limit(RAND_COUNT * 2).sorted().iterator(); it.hasNext();) { + builder.add(stats(it.nextInt(), it.nextInt())); + } + RAND_ASCENDING = (ColumnIndexBase) builder.build(); + + builder = ColumnIndexBuilder.getBuilder(TYPE, Integer.MAX_VALUE); + for (Iterator it = IntStream.generate(() -> RANDOM.nextInt(RAND_TO - RAND_FROM + 1) + RAND_FROM) + .limit(RAND_COUNT * 2).mapToObj(Integer::valueOf).sorted(Collections.reverseOrder()).iterator(); it + .hasNext();) { + builder.add(stats(it.next(), it.next())); + } + RAND_DESCENDING = (ColumnIndexBase) builder.build(); + } + + private static Statistics stats(int min, int max) { + Statistics stats = Statistics.createStats(TYPE); + stats.updateStats(min); + stats.updateStats(max); + return stats; + } + + private static ExecStats validateOperator(String msg, + Function.ValueComparator, PrimitiveIterator.OfInt> validatorOp, + Function.ValueComparator, PrimitiveIterator.OfInt> actualOp, + ColumnIndexBase.ValueComparator comparator) { + ExecStats stats = new ExecStats(); + + IntList expected = stats.measureLinear(validatorOp, comparator); + IntList actual = stats.measureBinary(actualOp, comparator); + + Assert.assertEquals(msg, expected, actual); + + return stats; + } + + @Test + public void testEq() { + for (int i = FROM - 1; i <= TO + 1; ++i) { + validateOperator("Mismatching page indexes for value " + i + " with ASCENDING order", + BoundaryOrder.UNORDERED::eq, + BoundaryOrder.ASCENDING::eq, + ASCENDING.createValueComparator(i)); + validateOperator("Mismatching page indexes for value " + i + " with DESCENDING order", + BoundaryOrder.UNORDERED::eq, + BoundaryOrder.DESCENDING::eq, + DESCENDING.createValueComparator(i)); + } + for (int i = SINGLE_FROM - 1; i <= SINGLE_TO + 1; ++i) { + ColumnIndexBase.ValueComparator singleComparator = SINGLE.createValueComparator(i); + validateOperator("Mismatching page indexes for value " + i + " with ASCENDING order", + BoundaryOrder.UNORDERED::eq, + BoundaryOrder.ASCENDING::eq, + 
singleComparator); + validateOperator("Mismatching page indexes for value " + i + " with DESCENDING order", + BoundaryOrder.UNORDERED::eq, + BoundaryOrder.DESCENDING::eq, + singleComparator); + } + ExecStats stats = new ExecStats(); + for (int i = RAND_FROM - 1; i <= RAND_TO + 1; ++i) { + stats.add(validateOperator("Mismatching page indexes for value " + i + " with ASCENDING order", + BoundaryOrder.UNORDERED::eq, + BoundaryOrder.ASCENDING::eq, + RAND_ASCENDING.createValueComparator(i))); + stats.add(validateOperator("Mismatching page indexes for value " + i + " with DESCENDING order", + BoundaryOrder.UNORDERED::eq, + BoundaryOrder.DESCENDING::eq, + RAND_DESCENDING.createValueComparator(i))); + } + LOGGER.info("Executed eq on random data (page count: {}, values searched: {}):\n{}", RAND_COUNT, + RAND_TO - RAND_FROM + 2, stats); + } + + @Test + public void testGt() { + for (int i = FROM - 1; i <= TO + 1; ++i) { + validateOperator("Mismatching page indexes for value " + i + " with ASCENDING order", + BoundaryOrder.UNORDERED::gt, + BoundaryOrder.ASCENDING::gt, + ASCENDING.createValueComparator(i)); + validateOperator("Mismatching page indexes for value " + i + " with DESCENDING order", + BoundaryOrder.UNORDERED::gt, + BoundaryOrder.DESCENDING::gt, + DESCENDING.createValueComparator(i)); + } + for (int i = SINGLE_FROM - 1; i <= SINGLE_TO + 1; ++i) { + ColumnIndexBase.ValueComparator singleComparator = SINGLE.createValueComparator(i); + validateOperator("Mismatching page indexes for value " + i + " with ASCENDING order", + BoundaryOrder.UNORDERED::gt, + BoundaryOrder.ASCENDING::gt, + singleComparator); + validateOperator("Mismatching page indexes for value " + i + " with DESCENDING order", + BoundaryOrder.UNORDERED::gt, + BoundaryOrder.DESCENDING::gt, + singleComparator); + } + ExecStats stats = new ExecStats(); + for (int i = RAND_FROM - 1; i <= RAND_TO + 1; ++i) { + stats.add(validateOperator("Mismatching page indexes for value " + i + " with ASCENDING order", + 
BoundaryOrder.UNORDERED::gt, + BoundaryOrder.ASCENDING::gt, + RAND_ASCENDING.createValueComparator(i))); + stats.add(validateOperator("Mismatching page indexes for value " + i + " with DESCENDING order", + BoundaryOrder.UNORDERED::gt, + BoundaryOrder.DESCENDING::gt, + RAND_DESCENDING.createValueComparator(i))); + } + LOGGER.info("Executed gt on random data (page count: {}, values searched: {}):\n{}", RAND_COUNT, + RAND_TO - RAND_FROM + 2, stats); + } + + @Test + public void testGtEq() { + for (int i = FROM - 1; i <= TO + 1; ++i) { + validateOperator("Mismatching page indexes for value " + i + " with ASCENDING order", + BoundaryOrder.UNORDERED::gtEq, + BoundaryOrder.ASCENDING::gtEq, + ASCENDING.createValueComparator(i)); + validateOperator("Mismatching page indexes for value " + i + " with DESCENDING order", + BoundaryOrder.UNORDERED::gtEq, + BoundaryOrder.DESCENDING::gtEq, + DESCENDING.createValueComparator(i)); + } + for (int i = SINGLE_FROM - 1; i <= SINGLE_TO + 1; ++i) { + ColumnIndexBase.ValueComparator singleComparator = SINGLE.createValueComparator(i); + validateOperator("Mismatching page indexes for value " + i + " with ASCENDING order", + BoundaryOrder.UNORDERED::gtEq, + BoundaryOrder.ASCENDING::gtEq, + singleComparator); + validateOperator("Mismatching page indexes for value " + i + " with DESCENDING order", + BoundaryOrder.UNORDERED::gtEq, + BoundaryOrder.DESCENDING::gtEq, + singleComparator); + } + ExecStats stats = new ExecStats(); + for (int i = RAND_FROM - 1; i <= RAND_TO + 1; ++i) { + stats.add(validateOperator("Mismatching page indexes for value " + i + " with ASCENDING order", + BoundaryOrder.UNORDERED::gtEq, + BoundaryOrder.ASCENDING::gtEq, + RAND_ASCENDING.createValueComparator(i))); + stats.add(validateOperator("Mismatching page indexes for value " + i + " with DESCENDING order", + BoundaryOrder.UNORDERED::gtEq, + BoundaryOrder.DESCENDING::gtEq, + RAND_DESCENDING.createValueComparator(i))); + } + LOGGER.info("Executed gtEq on random data (page 
count: {}, values searched: {}):\n{}", RAND_COUNT, + RAND_TO - RAND_FROM + 2, stats); + } + + @Test + public void testLt() { + for (int i = FROM - 1; i <= TO + 1; ++i) { + validateOperator("Mismatching page indexes for value " + i + " with ASCENDING order", + BoundaryOrder.UNORDERED::lt, + BoundaryOrder.ASCENDING::lt, + ASCENDING.createValueComparator(i)); + validateOperator("Mismatching page indexes for value " + i + " with DESCENDING order", + BoundaryOrder.UNORDERED::lt, + BoundaryOrder.DESCENDING::lt, + DESCENDING.createValueComparator(i)); + } + for (int i = SINGLE_FROM - 1; i <= SINGLE_TO + 1; ++i) { + ColumnIndexBase.ValueComparator singleComparator = SINGLE.createValueComparator(i); + validateOperator("Mismatching page indexes for value " + i + " with ASCENDING order", + BoundaryOrder.UNORDERED::lt, + BoundaryOrder.ASCENDING::lt, + singleComparator); + validateOperator("Mismatching page indexes for value " + i + " with DESCENDING order", + BoundaryOrder.UNORDERED::lt, + BoundaryOrder.DESCENDING::lt, + singleComparator); + } + ExecStats stats = new ExecStats(); + for (int i = RAND_FROM - 1; i <= RAND_TO + 1; ++i) { + stats.add(validateOperator("Mismatching page indexes for value " + i + " with ASCENDING order", + BoundaryOrder.UNORDERED::lt, + BoundaryOrder.ASCENDING::lt, + RAND_ASCENDING.createValueComparator(i))); + stats.add(validateOperator("Mismatching page indexes for value " + i + " with DESCENDING order", + BoundaryOrder.UNORDERED::lt, + BoundaryOrder.DESCENDING::lt, + RAND_DESCENDING.createValueComparator(i))); + } + LOGGER.info("Executed lt on random data (page count: {}, values searched: {}):\n{}", RAND_COUNT, + RAND_TO - RAND_FROM + 2, stats); + } + + @Test + public void testLtEq() { + for (int i = FROM - 1; i <= TO + 1; ++i) { + validateOperator("Mismatching page indexes for value " + i + " with ASCENDING order", + BoundaryOrder.UNORDERED::ltEq, + BoundaryOrder.ASCENDING::ltEq, + ASCENDING.createValueComparator(i)); + 
validateOperator("Mismatching page indexes for value " + i + " with DESCENDING order", + BoundaryOrder.UNORDERED::ltEq, + BoundaryOrder.DESCENDING::ltEq, + DESCENDING.createValueComparator(i)); + } + for (int i = SINGLE_FROM - 1; i <= SINGLE_TO + 1; ++i) { + ColumnIndexBase.ValueComparator singleComparator = SINGLE.createValueComparator(i); + validateOperator("Mismatching page indexes for value " + i + " with ASCENDING order", + BoundaryOrder.UNORDERED::ltEq, + BoundaryOrder.ASCENDING::ltEq, + singleComparator); + validateOperator("Mismatching page indexes for value " + i + " with DESCENDING order", + BoundaryOrder.UNORDERED::ltEq, + BoundaryOrder.DESCENDING::ltEq, + singleComparator); + } + ExecStats stats = new ExecStats(); + for (int i = RAND_FROM - 1; i <= RAND_TO + 1; ++i) { + stats.add(validateOperator("Mismatching page indexes for value " + i + " with ASCENDING order", + BoundaryOrder.UNORDERED::ltEq, + BoundaryOrder.ASCENDING::ltEq, + RAND_ASCENDING.createValueComparator(i))); + stats.add(validateOperator("Mismatching page indexes for value " + i + " with DESCENDING order", + BoundaryOrder.UNORDERED::ltEq, + BoundaryOrder.DESCENDING::ltEq, + RAND_DESCENDING.createValueComparator(i))); + } + LOGGER.info("Executed ltEq on random data (page count: {}, values searched: {}):\n{}", RAND_COUNT, + RAND_TO - RAND_FROM + 2, stats); + } + + @Test + public void testNotEq() { + for (int i = -16; i <= 16; ++i) { + validateOperator("Mismatching page indexes for value " + i + " with ASCENDING order", + BoundaryOrder.UNORDERED::notEq, + BoundaryOrder.ASCENDING::notEq, + ASCENDING.createValueComparator(i)); + validateOperator("Mismatching page indexes for value " + i + " with DESCENDING order", + BoundaryOrder.UNORDERED::notEq, + BoundaryOrder.DESCENDING::notEq, + DESCENDING.createValueComparator(i)); + } + for (int i = FROM - 1; i <= TO + 1; ++i) { + ColumnIndexBase.ValueComparator singleComparator = SINGLE.createValueComparator(i); + validateOperator("Mismatching page 
indexes for value " + i + " with ASCENDING order", + BoundaryOrder.UNORDERED::notEq, + BoundaryOrder.ASCENDING::notEq, + singleComparator); + validateOperator("Mismatching page indexes for value " + i + " with DESCENDING order", + BoundaryOrder.UNORDERED::notEq, + BoundaryOrder.DESCENDING::notEq, + singleComparator); + } + ExecStats stats = new ExecStats(); + for (int i = RAND_FROM - 1; i <= RAND_TO + 1; ++i) { + stats.add(validateOperator("Mismatching page indexes for value " + i + " with ASCENDING order", + BoundaryOrder.UNORDERED::notEq, + BoundaryOrder.ASCENDING::notEq, + RAND_ASCENDING.createValueComparator(i))); + stats.add(validateOperator("Mismatching page indexes for value " + i + " with DESCENDING order", + BoundaryOrder.UNORDERED::notEq, + BoundaryOrder.DESCENDING::notEq, + RAND_DESCENDING.createValueComparator(i))); + } + LOGGER.info("Executed notEq on random data (page count: {}, values searched: {}):\n{}", RAND_COUNT, + RAND_TO - RAND_FROM + 2, stats); + } + +} diff --git a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java new file mode 100644 index 0000000000..5a3947c980 --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java @@ -0,0 +1,1546 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.internal.column.columnindex; + +import static java.util.Arrays.asList; +import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.booleanColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.doubleColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.eq; +import static org.apache.parquet.filter2.predicate.FilterApi.floatColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.gt; +import static org.apache.parquet.filter2.predicate.FilterApi.gtEq; +import static org.apache.parquet.filter2.predicate.FilterApi.intColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.longColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.lt; +import static org.apache.parquet.filter2.predicate.FilterApi.ltEq; +import static org.apache.parquet.filter2.predicate.FilterApi.notEq; +import static org.apache.parquet.filter2.predicate.FilterApi.userDefined; +import static org.apache.parquet.filter2.predicate.LogicalInverter.invert; +import static org.apache.parquet.schema.OriginalType.DECIMAL; +import static org.apache.parquet.schema.OriginalType.UINT_8; +import static org.apache.parquet.schema.OriginalType.UTF8; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static 
org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; +import static org.hamcrest.CoreMatchers.instanceOf; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertThat; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; + +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.filter2.predicate.FilterPredicate; +import org.apache.parquet.filter2.predicate.Operators.BinaryColumn; +import org.apache.parquet.filter2.predicate.Operators.BooleanColumn; +import org.apache.parquet.filter2.predicate.Operators.DoubleColumn; +import org.apache.parquet.filter2.predicate.Operators.FloatColumn; +import org.apache.parquet.filter2.predicate.Operators.IntColumn; +import org.apache.parquet.filter2.predicate.Operators.LongColumn; +import org.apache.parquet.filter2.predicate.UserDefinedPredicate; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Types; +import org.junit.Test; + +/** + * Tests for {@link ColumnIndexBuilder}. 
+ */ +public class TestColumnIndexBuilder { + + public static class BinaryDecimalIsNullOrZeroUdp extends UserDefinedPredicate { + private static final Binary ZERO = decimalBinary("0.0"); + + @Override + public boolean keep(Binary value) { + return value == null || value.equals(ZERO); + } + + @Override + public boolean canDrop(org.apache.parquet.filter2.predicate.Statistics statistics) { + Comparator cmp = statistics.getComparator(); + return cmp.compare(statistics.getMin(), ZERO) > 0 || cmp.compare(statistics.getMax(), ZERO) < 0; + } + + @Override + public boolean inverseCanDrop(org.apache.parquet.filter2.predicate.Statistics statistics) { + Comparator cmp = statistics.getComparator(); + return cmp.compare(statistics.getMin(), ZERO) == 0 && cmp.compare(statistics.getMax(), ZERO) == 0; + } + } + + public static class BinaryUtf8StartsWithB extends UserDefinedPredicate { + private static final Binary B = stringBinary("B"); + private static final Binary C = stringBinary("C"); + + @Override + public boolean keep(Binary value) { + return value != null && value.length() > 0 && value.getBytesUnsafe()[0] == 'B'; + } + + @Override + public boolean canDrop(org.apache.parquet.filter2.predicate.Statistics statistics) { + Comparator cmp = statistics.getComparator(); + return cmp.compare(statistics.getMin(), C) >= 0 || cmp.compare(statistics.getMax(), B) < 0; + } + + @Override + public boolean inverseCanDrop(org.apache.parquet.filter2.predicate.Statistics statistics) { + Comparator cmp = statistics.getComparator(); + return cmp.compare(statistics.getMin(), B) >= 0 && cmp.compare(statistics.getMax(), C) < 0; + } + } + + public static class BooleanIsTrueOrNull extends UserDefinedPredicate { + @Override + public boolean keep(Boolean value) { + return value == null || value; + } + + @Override + public boolean canDrop(org.apache.parquet.filter2.predicate.Statistics statistics) { + return statistics.getComparator().compare(statistics.getMax(), true) != 0; + } + + @Override + public 
boolean inverseCanDrop(org.apache.parquet.filter2.predicate.Statistics statistics) { + return statistics.getComparator().compare(statistics.getMin(), true) == 0; + } + } + + public static class DoubleIsInteger extends UserDefinedPredicate { + @Override + public boolean keep(Double value) { + return value != null && Math.floor(value) == value; + } + + @Override + public boolean canDrop(org.apache.parquet.filter2.predicate.Statistics statistics) { + double min = statistics.getMin(); + double max = statistics.getMax(); + Comparator cmp = statistics.getComparator(); + return cmp.compare(Math.floor(min), Math.floor(max)) == 0 && cmp.compare(Math.floor(min), min) != 0 + && cmp.compare(Math.floor(max), max) != 0; + } + + @Override + public boolean inverseCanDrop(org.apache.parquet.filter2.predicate.Statistics statistics) { + double min = statistics.getMin(); + double max = statistics.getMax(); + Comparator cmp = statistics.getComparator(); + return cmp.compare(min, max) == 0 && cmp.compare(Math.floor(min), min) == 0; + } + } + + public static class FloatIsInteger extends UserDefinedPredicate { + private static float floor(float value) { + return (float) Math.floor(value); + } + + @Override + public boolean keep(Float value) { + return value != null && Math.floor(value) == value; + } + + @Override + public boolean canDrop(org.apache.parquet.filter2.predicate.Statistics statistics) { + float min = statistics.getMin(); + float max = statistics.getMax(); + Comparator cmp = statistics.getComparator(); + return cmp.compare(floor(min), floor(max)) == 0 && cmp.compare(floor(min), min) != 0 + && cmp.compare(floor(max), max) != 0; + } + + @Override + public boolean inverseCanDrop(org.apache.parquet.filter2.predicate.Statistics statistics) { + float min = statistics.getMin(); + float max = statistics.getMax(); + Comparator cmp = statistics.getComparator(); + return cmp.compare(min, max) == 0 && cmp.compare(floor(min), min) == 0; + } + } + + public static class 
IntegerIsDivisableWith3 extends UserDefinedPredicate { + @Override + public boolean keep(Integer value) { + return value != null && value % 3 == 0; + } + + @Override + public boolean canDrop(org.apache.parquet.filter2.predicate.Statistics statistics) { + int min = statistics.getMin(); + int max = statistics.getMax(); + return min % 3 != 0 && max % 3 != 0 && max - min < 3; + } + + @Override + public boolean inverseCanDrop(org.apache.parquet.filter2.predicate.Statistics statistics) { + int min = statistics.getMin(); + int max = statistics.getMax(); + return min == max && min % 3 == 0; + } + } + + public static class LongIsDivisableWith3 extends UserDefinedPredicate { + @Override + public boolean keep(Long value) { + return value != null && value % 3 == 0; + } + + @Override + public boolean canDrop(org.apache.parquet.filter2.predicate.Statistics statistics) { + long min = statistics.getMin(); + long max = statistics.getMax(); + return min % 3 != 0 && max % 3 != 0 && max - min < 3; + } + + @Override + public boolean inverseCanDrop(org.apache.parquet.filter2.predicate.Statistics statistics) { + long min = statistics.getMin(); + long max = statistics.getMax(); + return min == max && min % 3 == 0; + } + } + + @Test + public void testBuildBinaryDecimal() { + PrimitiveType type = Types.required(BINARY).as(DECIMAL).precision(12).scale(2).named("test_binary_decimal"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + assertThat(builder, instanceOf(BinaryColumnIndexBuilder.class)); + assertNull(builder.build()); + BinaryColumn col = binaryColumn("test_col"); + + StatsBuilder sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, decimalBinary("-0.17"), decimalBinary("1234567890.12"))); + builder.add(sb.stats(type, decimalBinary("-234.23"), null, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, decimalBinary("-9999293.23"), decimalBinary("2348978.45"))); + 
builder.add(sb.stats(type, null, null, null, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, decimalBinary("87656273"))); + assertEquals(8, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + ColumnIndex columnIndex = builder.build(); + assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 0, 3, 3, 0, 4, 2, 0); + assertCorrectNullPages(columnIndex, true, false, false, true, false, true, true, false); + assertCorrectValues(columnIndex.getMaxValues(), + null, + decimalBinary("1234567890.12"), + decimalBinary("-234.23"), + null, + decimalBinary("2348978.45"), + null, + null, + decimalBinary("87656273")); + assertCorrectValues(columnIndex.getMinValues(), + null, + decimalBinary("-0.17"), + decimalBinary("-234.23"), + null, + decimalBinary("-9999293.23"), + null, + null, + decimalBinary("87656273")); + assertCorrectFiltering(columnIndex, eq(col, decimalBinary("0.0")), 1, 4); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 5, 6); + assertCorrectFiltering(columnIndex, notEq(col, decimalBinary("87656273")), 0, 1, 2, 3, 4, 5, 6); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 2, 4, 7); + assertCorrectFiltering(columnIndex, gt(col, decimalBinary("2348978.45")), 1); + assertCorrectFiltering(columnIndex, gtEq(col, decimalBinary("2348978.45")), 1, 4); + assertCorrectFiltering(columnIndex, lt(col, decimalBinary("-234.23")), 4); + assertCorrectFiltering(columnIndex, ltEq(col, decimalBinary("-234.23")), 2, 4); + assertCorrectFiltering(columnIndex, userDefined(col, BinaryDecimalIsNullOrZeroUdp.class), 0, 1, 2, 3, 4, 5, 6); + assertCorrectFiltering(columnIndex, invert(userDefined(col, BinaryDecimalIsNullOrZeroUdp.class)), 1, 2, 4, 7); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null, null, null)); + builder.add(sb.stats(type, 
decimalBinary("-9999293.23"), decimalBinary("-234.23"))); + builder.add(sb.stats(type, decimalBinary("-0.17"), decimalBinary("87656273"))); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, decimalBinary("87656273"))); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, decimalBinary("1234567890.12"), null, null, null)); + builder.add(sb.stats(type, null, null, null)); + assertEquals(8, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 4, 0, 0, 2, 0, 2, 3, 3); + assertCorrectNullPages(columnIndex, true, false, false, true, false, true, false, true); + assertCorrectValues(columnIndex.getMaxValues(), + null, + decimalBinary("-234.23"), + decimalBinary("87656273"), + null, + decimalBinary("87656273"), + null, + decimalBinary("1234567890.12"), + null); + assertCorrectValues(columnIndex.getMinValues(), + null, + decimalBinary("-9999293.23"), + decimalBinary("-0.17"), + null, + decimalBinary("87656273"), + null, + decimalBinary("1234567890.12"), + null); + assertCorrectFiltering(columnIndex, eq(col, decimalBinary("87656273")), 2, 4); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 3, 5, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, decimalBinary("87656273")), 0, 1, 2, 3, 5, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 2, 4, 6); + assertCorrectFiltering(columnIndex, gt(col, decimalBinary("87656273")), 6); + assertCorrectFiltering(columnIndex, gtEq(col, decimalBinary("87656273")), 2, 4, 6); + assertCorrectFiltering(columnIndex, lt(col, decimalBinary("-0.17")), 1); + assertCorrectFiltering(columnIndex, ltEq(col, decimalBinary("-0.17")), 1, 2); + assertCorrectFiltering(columnIndex, userDefined(col, BinaryDecimalIsNullOrZeroUdp.class), 0, 2, 3, 5, 6, 7); + assertCorrectFiltering(columnIndex, invert(userDefined(col, 
BinaryDecimalIsNullOrZeroUdp.class)), 1, 2, 4, 6); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, decimalBinary("1234567890.12"), null, null, null)); + builder.add(sb.stats(type, null, null, null, null)); + builder.add(sb.stats(type, decimalBinary("1234567890.12"), decimalBinary("87656273"))); + builder.add(sb.stats(type, decimalBinary("987656273"), decimalBinary("-0.17"))); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, decimalBinary("-234.23"), decimalBinary("-9999293.23"))); + assertEquals(8, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 3, 2, 3, 4, 0, 0, 2, 0); + assertCorrectNullPages(columnIndex, true, true, false, true, false, false, true, false); + assertCorrectValues(columnIndex.getMaxValues(), + null, + null, + decimalBinary("1234567890.12"), + null, + decimalBinary("1234567890.12"), + decimalBinary("987656273"), + null, + decimalBinary("-234.23")); + assertCorrectValues(columnIndex.getMinValues(), + null, + null, + decimalBinary("1234567890.12"), + null, + decimalBinary("87656273"), + decimalBinary("-0.17"), + null, + decimalBinary("-9999293.23")); + assertCorrectFiltering(columnIndex, eq(col, decimalBinary("1234567890.12")), 2, 4); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 6); + assertCorrectFiltering(columnIndex, notEq(col, decimalBinary("0.0")), 0, 1, 2, 3, 4, 5, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, null), 2, 4, 5, 7); + assertCorrectFiltering(columnIndex, gt(col, decimalBinary("1234567890.12"))); + assertCorrectFiltering(columnIndex, gtEq(col, decimalBinary("1234567890.12")), 2, 4); + assertCorrectFiltering(columnIndex, lt(col, 
decimalBinary("-0.17")), 7); + assertCorrectFiltering(columnIndex, ltEq(col, decimalBinary("-0.17")), 5, 7); + assertCorrectFiltering(columnIndex, userDefined(col, BinaryDecimalIsNullOrZeroUdp.class), 0, 1, 2, 3, 5, 6); + assertCorrectFiltering(columnIndex, invert(userDefined(col, BinaryDecimalIsNullOrZeroUdp.class)), 2, 4, 5, 7); + } + + @Test + public void testBuildBinaryUtf8() { + PrimitiveType type = Types.required(BINARY).as(UTF8).named("test_binary_utf8"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + assertThat(builder, instanceOf(BinaryColumnIndexBuilder.class)); + assertNull(builder.build()); + BinaryColumn col = binaryColumn("test_col"); + + StatsBuilder sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, stringBinary("Jeltz"), stringBinary("Slartibartfast"), null, null)); + builder.add(sb.stats(type, null, null, null, null, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, stringBinary("Beeblebrox"), stringBinary("Prefect"))); + builder.add(sb.stats(type, stringBinary("Dent"), stringBinary("Trilian"), null)); + builder.add(sb.stats(type, stringBinary("Beeblebrox"))); + builder.add(sb.stats(type, null, null)); + assertEquals(8, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + ColumnIndex columnIndex = builder.build(); + assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 2, 5, 2, 0, 1, 0, 2); + assertCorrectNullPages(columnIndex, true, false, true, true, false, false, false, true); + assertCorrectValues(columnIndex.getMaxValues(), + null, + stringBinary("Slartibartfast"), + null, + null, + stringBinary("Prefect"), + stringBinary("Trilian"), + stringBinary("Beeblebrox"), + null); + assertCorrectValues(columnIndex.getMinValues(), + null, + stringBinary("Jeltz"), + null, + null, + stringBinary("Beeblebrox"), + stringBinary("Dent"), + 
stringBinary("Beeblebrox"), + null); + assertCorrectFiltering(columnIndex, eq(col, stringBinary("Marvin")), 1, 4, 5); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 5, 7); + assertCorrectFiltering(columnIndex, notEq(col, stringBinary("Beeblebrox")), 0, 1, 2, 3, 4, 5, 7); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 4, 5, 6); + assertCorrectFiltering(columnIndex, gt(col, stringBinary("Prefect")), 1, 5); + assertCorrectFiltering(columnIndex, gtEq(col, stringBinary("Prefect")), 1, 4, 5); + assertCorrectFiltering(columnIndex, lt(col, stringBinary("Dent")), 4, 6); + assertCorrectFiltering(columnIndex, ltEq(col, stringBinary("Dent")), 4, 5, 6); + assertCorrectFiltering(columnIndex, userDefined(col, BinaryUtf8StartsWithB.class), 4, 6); + assertCorrectFiltering(columnIndex, invert(userDefined(col, BinaryUtf8StartsWithB.class)), 0, 1, 2, 3, 4, 5, 7); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, stringBinary("Beeblebrox"), stringBinary("Dent"), null, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null, null, null, null)); + builder.add(sb.stats(type, stringBinary("Dent"), stringBinary("Jeltz"))); + builder.add(sb.stats(type, stringBinary("Dent"), stringBinary("Prefect"), null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, stringBinary("Slartibartfast"))); + builder.add(sb.stats(type, null, null)); + assertEquals(8, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 2, 5, 0, 1, 2, 0, 2); + assertCorrectNullPages(columnIndex, false, true, true, false, false, true, false, true); + assertCorrectValues(columnIndex.getMaxValues(), + stringBinary("Dent"), + null, + null, + stringBinary("Jeltz"), + stringBinary("Prefect"), + 
null, + stringBinary("Slartibartfast"), + null); + assertCorrectValues(columnIndex.getMinValues(), + stringBinary("Beeblebrox"), + null, + null, + stringBinary("Dent"), + stringBinary("Dent"), + null, + stringBinary("Slartibartfast"), + null); + assertCorrectFiltering(columnIndex, eq(col, stringBinary("Jeltz")), 3, 4); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 4, 5, 7); + assertCorrectFiltering(columnIndex, notEq(col, stringBinary("Slartibartfast")), 0, 1, 2, 3, 4, 5, 7); + assertCorrectFiltering(columnIndex, notEq(col, null), 0, 3, 4, 6); + assertCorrectFiltering(columnIndex, gt(col, stringBinary("Marvin")), 4, 6); + assertCorrectFiltering(columnIndex, gtEq(col, stringBinary("Marvin")), 4, 6); + assertCorrectFiltering(columnIndex, lt(col, stringBinary("Dent")), 0); + assertCorrectFiltering(columnIndex, ltEq(col, stringBinary("Dent")), 0, 3, 4); + assertCorrectFiltering(columnIndex, userDefined(col, BinaryUtf8StartsWithB.class), 0); + assertCorrectFiltering(columnIndex, invert(userDefined(col, BinaryUtf8StartsWithB.class)), 0, 1, 2, 3, 4, 5, 6, 7); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, stringBinary("Slartibartfast"))); + builder.add(sb.stats(type, null, null, null, null, null)); + builder.add(sb.stats(type, stringBinary("Prefect"), stringBinary("Jeltz"), null)); + builder.add(sb.stats(type, stringBinary("Dent"), stringBinary("Dent"))); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, stringBinary("Dent"), stringBinary("Beeblebrox"), null, null)); + assertEquals(8, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 0, 5, 1, 0, 2, 2, 2); + 
assertCorrectNullPages(columnIndex, true, false, true, false, false, true, true, false); + assertCorrectValues(columnIndex.getMaxValues(), + null, + stringBinary("Slartibartfast"), + null, + stringBinary("Prefect"), + stringBinary("Dent"), + null, + null, + stringBinary("Dent")); + assertCorrectValues(columnIndex.getMinValues(), + null, + stringBinary("Slartibartfast"), + null, + stringBinary("Jeltz"), + stringBinary("Dent"), + null, + null, + stringBinary("Beeblebrox")); + assertCorrectFiltering(columnIndex, eq(col, stringBinary("Marvin")), 3); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 5, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, stringBinary("Dent")), 0, 1, 2, 3, 5, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 3, 4, 7); + assertCorrectFiltering(columnIndex, gt(col, stringBinary("Prefect")), 1); + assertCorrectFiltering(columnIndex, gtEq(col, stringBinary("Prefect")), 1, 3); + assertCorrectFiltering(columnIndex, lt(col, stringBinary("Marvin")), 3, 4, 7); + assertCorrectFiltering(columnIndex, ltEq(col, stringBinary("Marvin")), 3, 4, 7); + assertCorrectFiltering(columnIndex, userDefined(col, BinaryUtf8StartsWithB.class), 7); + assertCorrectFiltering(columnIndex, invert(userDefined(col, BinaryUtf8StartsWithB.class)), 0, 1, 2, 3, 4, 5, 6, 7); + } + + @Test + public void testStaticBuildBinary() { + ColumnIndex columnIndex = ColumnIndexBuilder.build( + Types.required(BINARY).as(UTF8).named("test_binary_utf8"), + BoundaryOrder.ASCENDING, + asList(true, true, false, false, true, false, true, false), + asList(1l, 2l, 3l, 4l, 5l, 6l, 7l, 8l), + toBBList( + null, + null, + stringBinary("Beeblebrox"), + stringBinary("Dent"), + null, + stringBinary("Jeltz"), + null, + stringBinary("Slartibartfast")), + toBBList( + null, + null, + stringBinary("Dent"), + stringBinary("Dent"), + null, + stringBinary("Prefect"), + null, + stringBinary("Slartibartfast"))); + assertEquals(BoundaryOrder.ASCENDING, 
columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 1, 2, 3, 4, 5, 6, 7, 8); + assertCorrectNullPages(columnIndex, true, true, false, false, true, false, true, false); + assertCorrectValues(columnIndex.getMaxValues(), + null, + null, + stringBinary("Dent"), + stringBinary("Dent"), + null, + stringBinary("Prefect"), + null, + stringBinary("Slartibartfast")); + assertCorrectValues(columnIndex.getMinValues(), + null, + null, + stringBinary("Beeblebrox"), + stringBinary("Dent"), + null, + stringBinary("Jeltz"), + null, + stringBinary("Slartibartfast")); + } + + @Test + public void testFilterWithoutNullCounts() { + ColumnIndex columnIndex = ColumnIndexBuilder.build( + Types.required(BINARY).as(UTF8).named("test_binary_utf8"), + BoundaryOrder.ASCENDING, + asList(true, true, false, false, true, false, true, false), + null, + toBBList( + null, + null, + stringBinary("Beeblebrox"), + stringBinary("Dent"), + null, + stringBinary("Jeltz"), + null, + stringBinary("Slartibartfast")), + toBBList( + null, + null, + stringBinary("Dent"), + stringBinary("Dent"), + null, + stringBinary("Prefect"), + null, + stringBinary("Slartibartfast"))); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertNull(columnIndex.getNullCounts()); + assertCorrectNullPages(columnIndex, true, true, false, false, true, false, true, false); + assertCorrectValues(columnIndex.getMaxValues(), + null, + null, + stringBinary("Dent"), + stringBinary("Dent"), + null, + stringBinary("Prefect"), + null, + stringBinary("Slartibartfast")); + assertCorrectValues(columnIndex.getMinValues(), + null, + null, + stringBinary("Beeblebrox"), + stringBinary("Dent"), + null, + stringBinary("Jeltz"), + null, + stringBinary("Slartibartfast")); + + BinaryColumn col = binaryColumn("test_col"); + assertCorrectFiltering(columnIndex, eq(col, stringBinary("Dent")), 2, 3); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 4, 5, 6, 7); + assertCorrectFiltering(columnIndex, 
notEq(col, stringBinary("Dent")), 0, 1, 2, 3, 4, 5, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, null), 2, 3, 5, 7); + assertCorrectFiltering(columnIndex, userDefined(col, BinaryDecimalIsNullOrZeroUdp.class), 0, 1, 2, 3, 4, 5, 6, 7); + assertCorrectFiltering(columnIndex, invert(userDefined(col, BinaryDecimalIsNullOrZeroUdp.class)), 2, 3, 5, 7); + } + + @Test + public void testBuildBoolean() { + PrimitiveType type = Types.required(BOOLEAN).named("test_boolean"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + assertThat(builder, instanceOf(BooleanColumnIndexBuilder.class)); + assertNull(builder.build()); + BooleanColumn col = booleanColumn("test_col"); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + StatsBuilder sb = new StatsBuilder(); + builder.add(sb.stats(type, false, true)); + builder.add(sb.stats(type, true, false, null)); + builder.add(sb.stats(type, true, true, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, false, false)); + assertEquals(5, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + ColumnIndex columnIndex = builder.build(); + assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 0, 1, 2, 3, 0); + assertCorrectNullPages(columnIndex, false, false, false, true, false); + assertCorrectValues(columnIndex.getMaxValues(), true, true, true, null, false); + assertCorrectValues(columnIndex.getMinValues(), false, false, true, null, false); + assertCorrectFiltering(columnIndex, eq(col, true), 0, 1, 2); + assertCorrectFiltering(columnIndex, eq(col, null), 1, 2, 3); + assertCorrectFiltering(columnIndex, notEq(col, true), 0, 1, 2, 3, 4); + assertCorrectFiltering(columnIndex, notEq(col, null), 0, 1, 2, 4); + assertCorrectFiltering(columnIndex, userDefined(col, BooleanIsTrueOrNull.class), 0, 1, 2, 3); + assertCorrectFiltering(columnIndex, 
invert(userDefined(col, BooleanIsTrueOrNull.class)), 0, 1, 4); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, false, false)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, null, null, null, null)); + builder.add(sb.stats(type, false, true, null)); + builder.add(sb.stats(type, false, true, null, null)); + builder.add(sb.stats(type, null, null, null)); + assertEquals(7, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 0, 3, 4, 1, 2, 3); + assertCorrectNullPages(columnIndex, true, false, true, true, false, false, true); + assertCorrectValues(columnIndex.getMaxValues(), null, false, null, null, true, true, null); + assertCorrectValues(columnIndex.getMinValues(), null, false, null, null, false, false, null); + assertCorrectFiltering(columnIndex, eq(col, true), 4, 5); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 4, 5, 6); + assertCorrectFiltering(columnIndex, notEq(col, true), 0, 1, 2, 3, 4, 5, 6); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 4, 5); + assertCorrectFiltering(columnIndex, userDefined(col, BooleanIsTrueOrNull.class), 0, 2, 3, 4, 5, 6); + assertCorrectFiltering(columnIndex, invert(userDefined(col, BooleanIsTrueOrNull.class)), 1, 4, 5); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, true, true)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, null, null, null, null)); + builder.add(sb.stats(type, true, false, null)); + builder.add(sb.stats(type, false, false, null, null)); + builder.add(sb.stats(type, null, null, null)); + assertEquals(7, 
builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 0, 3, 4, 1, 2, 3); + assertCorrectNullPages(columnIndex, true, false, true, true, false, false, true); + assertCorrectValues(columnIndex.getMaxValues(), null, true, null, null, true, false, null); + assertCorrectValues(columnIndex.getMinValues(), null, true, null, null, false, false, null); + assertCorrectFiltering(columnIndex, eq(col, true), 1, 4); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 4, 5, 6); + assertCorrectFiltering(columnIndex, notEq(col, true), 0, 2, 3, 4, 5, 6); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 4, 5); + assertCorrectFiltering(columnIndex, userDefined(col, BooleanIsTrueOrNull.class), 0, 1, 2, 3, 4, 5, 6); + assertCorrectFiltering(columnIndex, invert(userDefined(col, BooleanIsTrueOrNull.class)), 4, 5); + } + + @Test + public void testStaticBuildBoolean() { + ColumnIndex columnIndex = ColumnIndexBuilder.build( + Types.required(BOOLEAN).named("test_boolean"), + BoundaryOrder.DESCENDING, + asList(false, true, false, true, false, true), + asList(9l, 8l, 7l, 6l, 5l, 0l), + toBBList(false, null, false, null, true, null), + toBBList(true, null, false, null, true, null)); + assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 9, 8, 7, 6, 5, 0); + assertCorrectNullPages(columnIndex, false, true, false, true, false, true); + assertCorrectValues(columnIndex.getMaxValues(), true, null, false, null, true, null); + assertCorrectValues(columnIndex.getMinValues(), false, null, false, null, true, null); + } + + @Test + public void testBuildDouble() { + PrimitiveType type = Types.required(DOUBLE).named("test_double"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + assertThat(builder, 
instanceOf(DoubleColumnIndexBuilder.class)); + assertNull(builder.build()); + DoubleColumn col = doubleColumn("test_col"); + + StatsBuilder sb = new StatsBuilder(); + builder.add(sb.stats(type, -4.2, -4.1)); + builder.add(sb.stats(type, -11.7, 7.0, null)); + builder.add(sb.stats(type, 2.2, 2.2, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 1.9, 2.32)); + builder.add(sb.stats(type, -21.0, 8.1)); + assertEquals(6, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + ColumnIndex columnIndex = builder.build(); + assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 0, 1, 2, 3, 0, 0); + assertCorrectNullPages(columnIndex, false, false, false, true, false, false); + assertCorrectValues(columnIndex.getMaxValues(), -4.1, 7.0, 2.2, null, 2.32, 8.1); + assertCorrectValues(columnIndex.getMinValues(), -4.2, -11.7, 2.2, null, 1.9, -21.0); + assertCorrectFiltering(columnIndex, eq(col, 0.0), 1, 5); + assertCorrectFiltering(columnIndex, eq(col, null), 1, 2, 3); + assertCorrectFiltering(columnIndex, notEq(col, 2.2), 0, 1, 2, 3, 4, 5); + assertCorrectFiltering(columnIndex, notEq(col, null), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, gt(col, 2.2), 1, 4, 5); + assertCorrectFiltering(columnIndex, gtEq(col, 2.2), 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, lt(col, -4.2), 1, 5); + assertCorrectFiltering(columnIndex, ltEq(col, -4.2), 0, 1, 5); + assertCorrectFiltering(columnIndex, userDefined(col, DoubleIsInteger.class), 1, 4, 5); + assertCorrectFiltering(columnIndex, invert(userDefined(col, DoubleIsInteger.class)), 0, 1, 2, 3, 4, 5); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, -532.3, -345.2, null, null)); + builder.add(sb.stats(type, -234.7, -234.6, null)); + builder.add(sb.stats(type, null, null)); + 
builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, -234.6, 2.99999)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, 3.0, 42.83)); + builder.add(sb.stats(type, null, null)); + assertEquals(9, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 2, 1, 2, 3, 0, 2, 0, 2); + assertCorrectNullPages(columnIndex, true, false, false, true, true, false, true, false, true); + assertCorrectValues(columnIndex.getMaxValues(), null, -345.2, -234.6, null, null, 2.99999, null, 42.83, null); + assertCorrectValues(columnIndex.getMinValues(), null, -532.3, -234.7, null, null, -234.6, null, 3.0, null); + assertCorrectFiltering(columnIndex, eq(col, 0.0), 5); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 4, 6, 8); + assertCorrectFiltering(columnIndex, notEq(col, 0.0), 0, 1, 2, 3, 4, 5, 6, 7, 8); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 2, 5, 7); + assertCorrectFiltering(columnIndex, gt(col, 2.99999), 7); + assertCorrectFiltering(columnIndex, gtEq(col, 2.99999), 5, 7); + assertCorrectFiltering(columnIndex, lt(col, -234.6), 1, 2); + assertCorrectFiltering(columnIndex, ltEq(col, -234.6), 1, 2, 5); + assertCorrectFiltering(columnIndex, userDefined(col, DoubleIsInteger.class), 1, 5, 7); + assertCorrectFiltering(columnIndex, invert(userDefined(col, DoubleIsInteger.class)), 0, 1, 2, 3, 4, 5, 6, 7, 8); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null, null, null, null)); + builder.add(sb.stats(type, 532.3, 345.2)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 234.7, 234.6, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, 234.69, -2.99999)); + builder.add(sb.stats(type, null, null)); 
+ builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, -3.0, -42.83)); + assertEquals(9, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 5, 0, 3, 1, 2, 0, 2, 2, 0); + assertCorrectNullPages(columnIndex, true, false, true, false, true, false, true, true, false); + assertCorrectValues(columnIndex.getMaxValues(), null, 532.3, null, 234.7, null, 234.69, null, null, -3.0); + assertCorrectValues(columnIndex.getMinValues(), null, 345.2, null, 234.6, null, -2.99999, null, null, -42.83); + assertCorrectFiltering(columnIndex, eq(col, 234.6), 3, 5); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 4, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, 2.2), 0, 1, 2, 3, 4, 5, 6, 7, 8); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 3, 5, 8); + assertCorrectFiltering(columnIndex, gt(col, 2.2), 1, 3, 5); + assertCorrectFiltering(columnIndex, gtEq(col, 234.69), 1, 3, 5); + assertCorrectFiltering(columnIndex, lt(col, -2.99999), 8); + assertCorrectFiltering(columnIndex, ltEq(col, -2.99999), 5, 8); + assertCorrectFiltering(columnIndex, userDefined(col, DoubleIsInteger.class), 1, 5, 8); + assertCorrectFiltering(columnIndex, invert(userDefined(col, DoubleIsInteger.class)), 0, 1, 2, 3, 4, 5, 6, 7, 8); + } + + @Test + public void testBuildDoubleZeroNaN() { + PrimitiveType type = Types.required(DOUBLE).named("test_double"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + StatsBuilder sb = new StatsBuilder(); + builder.add(sb.stats(type, -1.0, -0.0)); + builder.add(sb.stats(type, 0.0, 1.0)); + builder.add(sb.stats(type, 1.0, 100.0)); + ColumnIndex columnIndex = builder.build(); + assertCorrectValues(columnIndex.getMinValues(), -1.0, -0.0, 1.0); + assertCorrectValues(columnIndex.getMaxValues(), 0.0, 1.0, 100.0); + + builder 
= ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + builder.add(sb.stats(type, -1.0, -0.0)); + builder.add(sb.stats(type, 0.0, Double.NaN)); + builder.add(sb.stats(type, 1.0, 100.0)); + assertNull(builder.build()); + } + + @Test + public void testStaticBuildDouble() { + ColumnIndex columnIndex = ColumnIndexBuilder.build( + Types.required(DOUBLE).named("test_double"), + BoundaryOrder.UNORDERED, + asList(false, false, false, false, false, false), + asList(0l, 1l, 2l, 3l, 4l, 5l), + toBBList(-1.0, -2.0, -3.0, -4.0, -5.0, -6.0), + toBBList(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)); + assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 0, 1, 2, 3, 4, 5); + assertCorrectNullPages(columnIndex, false, false, false, false, false, false); + assertCorrectValues(columnIndex.getMaxValues(), 1.0, 2.0, 3.0, 4.0, 5.0, 6.0); + assertCorrectValues(columnIndex.getMinValues(), -1.0, -2.0, -3.0, -4.0, -5.0, -6.0); + } + + @Test + public void testBuildFloat() { + PrimitiveType type = Types.required(FLOAT).named("test_float"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + assertThat(builder, instanceOf(FloatColumnIndexBuilder.class)); + assertNull(builder.build()); + FloatColumn col = floatColumn("test_col"); + + StatsBuilder sb = new StatsBuilder(); + builder.add(sb.stats(type, -4.2f, -4.1f)); + builder.add(sb.stats(type, -11.7f, 7.0f, null)); + builder.add(sb.stats(type, 2.2f, 2.2f, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 1.9f, 2.32f)); + builder.add(sb.stats(type, -21.0f, 8.1f)); + assertEquals(6, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + ColumnIndex columnIndex = builder.build(); + assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 0, 1, 2, 3, 0, 0); + assertCorrectNullPages(columnIndex, false, false, false, true, false, false); + 
assertCorrectValues(columnIndex.getMaxValues(), -4.1f, 7.0f, 2.2f, null, 2.32f, 8.1f); + assertCorrectValues(columnIndex.getMinValues(), -4.2f, -11.7f, 2.2f, null, 1.9f, -21.0f); + assertCorrectFiltering(columnIndex, eq(col, 0.0f), 1, 5); + assertCorrectFiltering(columnIndex, eq(col, null), 1, 2, 3); + assertCorrectFiltering(columnIndex, notEq(col, 2.2f), 0, 1, 2, 3, 4, 5); + assertCorrectFiltering(columnIndex, notEq(col, null), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, gt(col, 2.2f), 1, 4, 5); + assertCorrectFiltering(columnIndex, gtEq(col, 2.2f), 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, lt(col, 0.0f), 0, 1, 5); + assertCorrectFiltering(columnIndex, ltEq(col, 1.9f), 0, 1, 4, 5); + assertCorrectFiltering(columnIndex, userDefined(col, FloatIsInteger.class), 1, 4, 5); + assertCorrectFiltering(columnIndex, invert(userDefined(col, FloatIsInteger.class)), 0, 1, 2, 3, 4, 5); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, -532.3f, -345.2f, null, null)); + builder.add(sb.stats(type, -300.6f, -234.7f, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, -234.6f, 2.99999f)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, 3.0f, 42.83f)); + builder.add(sb.stats(type, null, null)); + assertEquals(9, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 2, 1, 2, 3, 0, 2, 0, 2); + assertCorrectNullPages(columnIndex, true, false, false, true, true, false, true, false, true); + assertCorrectValues(columnIndex.getMaxValues(), null, -345.2f, -234.7f, null, null, 2.99999f, null, 42.83f, null); + assertCorrectValues(columnIndex.getMinValues(), null, -532.3f, -300.6f, 
null, null, -234.6f, null, 3.0f, null); + assertCorrectFiltering(columnIndex, eq(col, 0.0f), 5); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 4, 6, 8); + assertCorrectFiltering(columnIndex, notEq(col, 2.2f), 0, 1, 2, 3, 4, 5, 6, 7, 8); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 2, 5, 7); + assertCorrectFiltering(columnIndex, gt(col, 2.2f), 5, 7); + assertCorrectFiltering(columnIndex, gtEq(col, -234.7f), 2, 5, 7); + assertCorrectFiltering(columnIndex, lt(col, -234.6f), 1, 2); + assertCorrectFiltering(columnIndex, ltEq(col, -234.6f), 1, 2, 5); + assertCorrectFiltering(columnIndex, userDefined(col, FloatIsInteger.class), 1, 2, 5, 7); + assertCorrectFiltering(columnIndex, invert(userDefined(col, FloatIsInteger.class)), 0, 1, 2, 3, 4, 5, 6, 7, 8); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null, null, null, null)); + builder.add(sb.stats(type, 532.3f, 345.2f)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 234.7f, 234.6f, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, 234.6f, -2.99999f)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, -3.0f, -42.83f)); + assertEquals(9, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 5, 0, 3, 1, 2, 0, 2, 2, 0); + assertCorrectNullPages(columnIndex, true, false, true, false, true, false, true, true, false); + assertCorrectValues(columnIndex.getMaxValues(), null, 532.3f, null, 234.7f, null, 234.6f, null, null, -3.0f); + assertCorrectValues(columnIndex.getMinValues(), null, 345.2f, null, 234.6f, null, -2.99999f, null, null, -42.83f); + assertCorrectFiltering(columnIndex, eq(col, 234.65f), 3); + 
assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 4, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, 2.2f), 0, 1, 2, 3, 4, 5, 6, 7, 8); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 3, 5, 8); + assertCorrectFiltering(columnIndex, gt(col, 2.2f), 1, 3, 5); + assertCorrectFiltering(columnIndex, gtEq(col, 2.2f), 1, 3, 5); + assertCorrectFiltering(columnIndex, lt(col, 0.0f), 5, 8); + assertCorrectFiltering(columnIndex, ltEq(col, 0.0f), 5, 8); + assertCorrectFiltering(columnIndex, userDefined(col, FloatIsInteger.class), 1, 5, 8); + assertCorrectFiltering(columnIndex, invert(userDefined(col, FloatIsInteger.class)), 0, 1, 2, 3, 4, 5, 6, 7, 8); + } + + @Test + public void testBuildFloatZeroNaN() { + PrimitiveType type = Types.required(FLOAT).named("test_float"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + StatsBuilder sb = new StatsBuilder(); + builder.add(sb.stats(type, -1.0f, -0.0f)); + builder.add(sb.stats(type, 0.0f, 1.0f)); + builder.add(sb.stats(type, 1.0f, 100.0f)); + ColumnIndex columnIndex = builder.build(); + assertCorrectValues(columnIndex.getMinValues(), -1.0f, -0.0f, 1.0f); + assertCorrectValues(columnIndex.getMaxValues(), 0.0f, 1.0f, 100.0f); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + builder.add(sb.stats(type, -1.0f, -0.0f)); + builder.add(sb.stats(type, 0.0f, Float.NaN)); + builder.add(sb.stats(type, 1.0f, 100.0f)); + assertNull(builder.build()); + } + + @Test + public void testStaticBuildFloat() { + ColumnIndex columnIndex = ColumnIndexBuilder.build( + Types.required(FLOAT).named("test_float"), + BoundaryOrder.ASCENDING, + asList(true, true, true, false, false, false), + asList(9l, 8l, 7l, 6l, 0l, 0l), + toBBList(null, null, null, -3.0f, -2.0f, 0.1f), + toBBList(null, null, null, -2.0f, 0.0f, 6.0f)); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 9, 8, 7, 6, 0, 0); + 
assertCorrectNullPages(columnIndex, true, true, true, false, false, false); + assertCorrectValues(columnIndex.getMaxValues(), null, null, null, -2.0f, 0.0f, 6.0f); + assertCorrectValues(columnIndex.getMinValues(), null, null, null, -3.0f, -2.0f, 0.1f); + } + + @Test + public void testBuildInt32() { + PrimitiveType type = Types.required(INT32).named("test_int32"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + assertThat(builder, instanceOf(IntColumnIndexBuilder.class)); + assertNull(builder.build()); + IntColumn col = intColumn("test_col"); + + StatsBuilder sb = new StatsBuilder(); + builder.add(sb.stats(type, -4, 10)); + builder.add(sb.stats(type, -11, 7, null)); + builder.add(sb.stats(type, 2, 2, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 1, 2)); + builder.add(sb.stats(type, -21, 8)); + assertEquals(6, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + ColumnIndex columnIndex = builder.build(); + assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 0, 1, 2, 3, 0, 0); + assertCorrectNullPages(columnIndex, false, false, false, true, false, false); + assertCorrectValues(columnIndex.getMaxValues(), 10, 7, 2, null, 2, 8); + assertCorrectValues(columnIndex.getMinValues(), -4, -11, 2, null, 1, -21); + assertCorrectFiltering(columnIndex, eq(col, 2), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, eq(col, null), 1, 2, 3); + assertCorrectFiltering(columnIndex, notEq(col, 2), 0, 1, 2, 3, 4, 5); + assertCorrectFiltering(columnIndex, notEq(col, null), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, gt(col, 2), 0, 1, 5); + assertCorrectFiltering(columnIndex, gtEq(col, 2), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, lt(col, 2), 0, 1, 4, 5); + assertCorrectFiltering(columnIndex, ltEq(col, 2), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, userDefined(col, 
IntegerIsDivisableWith3.class), 0, 1, 5); + assertCorrectFiltering(columnIndex, invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, -532, -345, null, null)); + builder.add(sb.stats(type, -500, -42, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, -42, 2)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, 3, 42)); + builder.add(sb.stats(type, null, null)); + assertEquals(9, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 2, 1, 2, 3, 0, 2, 0, 2); + assertCorrectNullPages(columnIndex, true, false, false, true, true, false, true, false, true); + assertCorrectValues(columnIndex.getMaxValues(), null, -345, -42, null, null, 2, null, 42, null); + assertCorrectValues(columnIndex.getMinValues(), null, -532, -500, null, null, -42, null, 3, null); + assertCorrectFiltering(columnIndex, eq(col, 2), 5); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 4, 6, 8); + assertCorrectFiltering(columnIndex, notEq(col, 2), 0, 1, 2, 3, 4, 5, 6, 7, 8); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 2, 5, 7); + assertCorrectFiltering(columnIndex, gt(col, 2), 7); + assertCorrectFiltering(columnIndex, gtEq(col, 2), 5, 7); + assertCorrectFiltering(columnIndex, lt(col, 2), 1, 2, 5); + assertCorrectFiltering(columnIndex, ltEq(col, 2), 1, 2, 5); + assertCorrectFiltering(columnIndex, userDefined(col, IntegerIsDivisableWith3.class), 1, 2, 5, 7); + assertCorrectFiltering(columnIndex, invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5, 6, 7, + 8); + + builder = 
ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null, null, null, null)); + builder.add(sb.stats(type, 532, 345)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 234, 42, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, 42, -2)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, -3, -42)); + assertEquals(9, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 5, 0, 3, 1, 2, 0, 2, 2, 0); + assertCorrectNullPages(columnIndex, true, false, true, false, true, false, true, true, false); + assertCorrectValues(columnIndex.getMaxValues(), null, 532, null, 234, null, 42, null, null, -3); + assertCorrectValues(columnIndex.getMinValues(), null, 345, null, 42, null, -2, null, null, -42); + assertCorrectFiltering(columnIndex, eq(col, 2), 5); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 4, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, 2), 0, 1, 2, 3, 4, 5, 6, 7, 8); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 3, 5, 8); + assertCorrectFiltering(columnIndex, gt(col, 2), 1, 3, 5); + assertCorrectFiltering(columnIndex, gtEq(col, 2), 1, 3, 5); + assertCorrectFiltering(columnIndex, lt(col, 2), 5, 8); + assertCorrectFiltering(columnIndex, ltEq(col, 2), 5, 8); + assertCorrectFiltering(columnIndex, userDefined(col, IntegerIsDivisableWith3.class), 1, 3, 5, 8); + assertCorrectFiltering(columnIndex, invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5, 6, 7, + 8); + } + + @Test + public void testStaticBuildInt32() { + ColumnIndex columnIndex = ColumnIndexBuilder.build( + Types.required(INT32).named("test_int32"), + BoundaryOrder.DESCENDING, + asList(false, 
false, false, true, true, true), + asList(0l, 10l, 0l, 3l, 5l, 7l), + toBBList(10, 8, 6, null, null, null), + toBBList(9, 7, 5, null, null, null)); + assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 0, 10, 0, 3, 5, 7); + assertCorrectNullPages(columnIndex, false, false, false, true, true, true); + assertCorrectValues(columnIndex.getMaxValues(), 9, 7, 5, null, null, null); + assertCorrectValues(columnIndex.getMinValues(), 10, 8, 6, null, null, null); + } + + @Test + public void testBuildUInt8() { + PrimitiveType type = Types.required(INT32).as(UINT_8).named("test_uint8"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + assertThat(builder, instanceOf(IntColumnIndexBuilder.class)); + assertNull(builder.build()); + IntColumn col = intColumn("test_col"); + + StatsBuilder sb = new StatsBuilder(); + builder.add(sb.stats(type, 4, 10)); + builder.add(sb.stats(type, 11, 17, null)); + builder.add(sb.stats(type, 2, 2, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 1, 0xFF)); + builder.add(sb.stats(type, 0xEF, 0xFA)); + assertEquals(6, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + ColumnIndex columnIndex = builder.build(); + assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 0, 1, 2, 3, 0, 0); + assertCorrectNullPages(columnIndex, false, false, false, true, false, false); + assertCorrectValues(columnIndex.getMaxValues(), 10, 17, 2, null, 0xFF, 0xFA); + assertCorrectValues(columnIndex.getMinValues(), 4, 11, 2, null, 1, 0xEF); + assertCorrectFiltering(columnIndex, eq(col, 2), 2, 4); + assertCorrectFiltering(columnIndex, eq(col, null), 1, 2, 3); + assertCorrectFiltering(columnIndex, notEq(col, 2), 0, 1, 2, 3, 4, 5); + assertCorrectFiltering(columnIndex, notEq(col, null), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, gt(col, 2), 
0, 1, 4, 5); + assertCorrectFiltering(columnIndex, gtEq(col, 2), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, lt(col, 0xEF), 0, 1, 2, 4); + assertCorrectFiltering(columnIndex, ltEq(col, 0xEF), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, userDefined(col, IntegerIsDivisableWith3.class), 0, 1, 4, 5); + assertCorrectFiltering(columnIndex, invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, 0, 0, null, null)); + builder.add(sb.stats(type, 0, 42, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 42, 0xEE)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, 0xEF, 0xFF)); + builder.add(sb.stats(type, null, null)); + assertEquals(9, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 2, 1, 2, 3, 0, 2, 0, 2); + assertCorrectNullPages(columnIndex, true, false, false, true, true, false, true, false, true); + assertCorrectValues(columnIndex.getMaxValues(), null, 0, 42, null, null, 0xEE, null, 0xFF, null); + assertCorrectValues(columnIndex.getMinValues(), null, 0, 0, null, null, 42, null, 0xEF, null); + assertCorrectFiltering(columnIndex, eq(col, 2), 2); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 4, 6, 8); + assertCorrectFiltering(columnIndex, notEq(col, 2), 0, 1, 2, 3, 4, 5, 6, 7, 8); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 2, 5, 7); + assertCorrectFiltering(columnIndex, gt(col, 0xEE), 7); + assertCorrectFiltering(columnIndex, gtEq(col, 0xEE), 5, 7); + assertCorrectFiltering(columnIndex, lt(col, 42), 1, 2); + assertCorrectFiltering(columnIndex, 
ltEq(col, 42), 1, 2, 5); + assertCorrectFiltering(columnIndex, userDefined(col, IntegerIsDivisableWith3.class), 1, 2, 5, 7); + assertCorrectFiltering(columnIndex, invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5, 6, 7, + 8); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null, null, null, null)); + builder.add(sb.stats(type, 0xFF, 0xFF)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 0xEF, 0xEA, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, 0xEE, 42)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, 41, 0)); + assertEquals(9, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 5, 0, 3, 1, 2, 0, 2, 2, 0); + assertCorrectNullPages(columnIndex, true, false, true, false, true, false, true, true, false); + assertCorrectValues(columnIndex.getMaxValues(), null, 0xFF, null, 0xEF, null, 0xEE, null, null, 41); + assertCorrectValues(columnIndex.getMinValues(), null, 0xFF, null, 0xEA, null, 42, null, null, 0); + assertCorrectFiltering(columnIndex, eq(col, 0xAB), 5); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 4, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, 0xFF), 0, 2, 3, 4, 5, 6, 7, 8); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 3, 5, 8); + assertCorrectFiltering(columnIndex, gt(col, 0xFF)); + assertCorrectFiltering(columnIndex, gtEq(col, 0xFF), 1); + assertCorrectFiltering(columnIndex, lt(col, 42), 8); + assertCorrectFiltering(columnIndex, ltEq(col, 42), 5, 8); + assertCorrectFiltering(columnIndex, userDefined(col, IntegerIsDivisableWith3.class), 1, 3, 5, 8); + assertCorrectFiltering(columnIndex, 
invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 2, 3, 4, 5, 6, 7, + 8); + } + + @Test + public void testBuildInt64() { + PrimitiveType type = Types.required(INT64).named("test_int64"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + assertThat(builder, instanceOf(LongColumnIndexBuilder.class)); + assertNull(builder.build()); + LongColumn col = longColumn("test_col"); + + StatsBuilder sb = new StatsBuilder(); + builder.add(sb.stats(type, -4l, 10l)); + builder.add(sb.stats(type, -11l, 7l, null)); + builder.add(sb.stats(type, 2l, 2l, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 1l, 2l)); + builder.add(sb.stats(type, -21l, 8l)); + assertEquals(6, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + ColumnIndex columnIndex = builder.build(); + assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 0l, 1l, 2l, 3l, 0l, 0l); + assertCorrectNullPages(columnIndex, false, false, false, true, false, false); + assertCorrectValues(columnIndex.getMaxValues(), 10l, 7l, 2l, null, 2l, 8l); + assertCorrectValues(columnIndex.getMinValues(), -4l, -11l, 2l, null, 1l, -21l); + assertCorrectFiltering(columnIndex, eq(col, 0l), 0, 1, 5); + assertCorrectFiltering(columnIndex, eq(col, null), 1, 2, 3); + assertCorrectFiltering(columnIndex, notEq(col, 0l), 0, 1, 2, 3, 4, 5); + assertCorrectFiltering(columnIndex, notEq(col, null), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, gt(col, 2l), 0, 1, 5); + assertCorrectFiltering(columnIndex, gtEq(col, 2l), 0, 1, 2, 4, 5); + assertCorrectFiltering(columnIndex, lt(col, -21l)); + assertCorrectFiltering(columnIndex, ltEq(col, -21l), 5); + assertCorrectFiltering(columnIndex, userDefined(col, LongIsDivisableWith3.class), 0, 1, 5); + assertCorrectFiltering(columnIndex, invert(userDefined(col, LongIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5); + + builder = 
ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, -532l, -345l, null, null)); + builder.add(sb.stats(type, -234l, -42l, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, -42l, 2l)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, -3l, 42l)); + builder.add(sb.stats(type, null, null)); + assertEquals(9, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 2, 2, 1, 2, 3, 0, 2, 0, 2); + assertCorrectNullPages(columnIndex, true, false, false, true, true, false, true, false, true); + assertCorrectValues(columnIndex.getMaxValues(), null, -345l, -42l, null, null, 2l, null, 42l, null); + assertCorrectValues(columnIndex.getMinValues(), null, -532l, -234l, null, null, -42l, null, -3l, null); + assertCorrectFiltering(columnIndex, eq(col, -42l), 2, 5); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 4, 6, 8); + assertCorrectFiltering(columnIndex, notEq(col, -42l), 0, 1, 2, 3, 4, 5, 6, 7, 8); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 2, 5, 7); + assertCorrectFiltering(columnIndex, gt(col, 2l), 7); + assertCorrectFiltering(columnIndex, gtEq(col, 2l), 5, 7); + assertCorrectFiltering(columnIndex, lt(col, -42l), 1, 2); + assertCorrectFiltering(columnIndex, ltEq(col, -42l), 1, 2, 5); + assertCorrectFiltering(columnIndex, userDefined(col, LongIsDivisableWith3.class), 1, 2, 5, 7); + assertCorrectFiltering(columnIndex, invert(userDefined(col, LongIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5, 6, 7, + 8); + + builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + sb = new StatsBuilder(); + builder.add(sb.stats(type, null, null, null, null, null)); + 
builder.add(sb.stats(type, 532l, 345l)); + builder.add(sb.stats(type, null, null, null)); + builder.add(sb.stats(type, 234l, 42l, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, 42l, -2l)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, null, null)); + builder.add(sb.stats(type, -3l, -42l)); + assertEquals(9, builder.getPageCount()); + assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize()); + columnIndex = builder.build(); + assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 5, 0, 3, 1, 2, 0, 2, 2, 0); + assertCorrectNullPages(columnIndex, true, false, true, false, true, false, true, true, false); + assertCorrectValues(columnIndex.getMaxValues(), null, 532l, null, 234l, null, 42l, null, null, -3l); + assertCorrectValues(columnIndex.getMinValues(), null, 345l, null, 42l, null, -2l, null, null, -42l); + assertCorrectFiltering(columnIndex, eq(col, 0l), 5); + assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 4, 6, 7); + assertCorrectFiltering(columnIndex, notEq(col, 0l), 0, 1, 2, 3, 4, 5, 6, 7, 8); + assertCorrectFiltering(columnIndex, notEq(col, null), 1, 3, 5, 8); + assertCorrectFiltering(columnIndex, gt(col, 2l), 1, 3, 5); + assertCorrectFiltering(columnIndex, gtEq(col, 2l), 1, 3, 5); + assertCorrectFiltering(columnIndex, lt(col, -42l)); + assertCorrectFiltering(columnIndex, ltEq(col, -42l), 8); + assertCorrectFiltering(columnIndex, userDefined(col, LongIsDivisableWith3.class), 1, 3, 5, 8); + assertCorrectFiltering(columnIndex, invert(userDefined(col, LongIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5, 6, 7, + 8); + } + + @Test + public void testStaticBuildInt64() { + ColumnIndex columnIndex = ColumnIndexBuilder.build( + Types.required(INT64).named("test_int64"), + BoundaryOrder.UNORDERED, + asList(true, false, true, false, true, false), + asList(1l, 2l, 3l, 4l, 5l, 6l), + toBBList(null, 2l, null, 4l, null, 9l), + toBBList(null, 3l, null, 
15l, null, 10l)); + assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder()); + assertCorrectNullCounts(columnIndex, 1, 2, 3, 4, 5, 6); + assertCorrectNullPages(columnIndex, true, false, true, false, true, false); + assertCorrectValues(columnIndex.getMaxValues(), null, 3l, null, 15l, null, 10l); + assertCorrectValues(columnIndex.getMinValues(), null, 2l, null, 4l, null, 9l); + } + + @Test + public void testNoOpBuilder() { + ColumnIndexBuilder builder = ColumnIndexBuilder.getNoOpBuilder(); + StatsBuilder sb = new StatsBuilder(); + builder.add(sb.stats(Types.required(BINARY).as(UTF8).named("test_binary_utf8"), stringBinary("Jeltz"), + stringBinary("Slartibartfast"), null, null)); + builder.add(sb.stats(Types.required(BOOLEAN).named("test_boolean"), true, true, null, null)); + builder.add(sb.stats(Types.required(DOUBLE).named("test_double"), null, null, null)); + builder.add(sb.stats(Types.required(INT32).named("test_int32"), null, null)); + builder.add(sb.stats(Types.required(INT64).named("test_int64"), -234l, -42l, null)); + assertEquals(0, builder.getPageCount()); + assertEquals(0, builder.getMinMaxSize()); + assertNull(builder.build()); + } + + private static List toBBList(Binary... values) { + List buffers = new ArrayList<>(values.length); + for (Binary value : values) { + if (value == null) { + buffers.add(ByteBuffer.allocate(0)); + } else { + buffers.add(value.toByteBuffer()); + } + } + return buffers; + } + + private static List toBBList(Boolean... values) { + List buffers = new ArrayList<>(values.length); + for (Boolean value : values) { + if (value == null) { + buffers.add(ByteBuffer.allocate(0)); + } else { + buffers.add(ByteBuffer.wrap(BytesUtils.booleanToBytes(value))); + } + } + return buffers; + } + + private static List toBBList(Double... 
values) { + List buffers = new ArrayList<>(values.length); + for (Double value : values) { + if (value == null) { + buffers.add(ByteBuffer.allocate(0)); + } else { + buffers.add(ByteBuffer.wrap(BytesUtils.longToBytes(Double.doubleToLongBits(value)))); + } + } + return buffers; + } + + private static List toBBList(Float... values) { + List buffers = new ArrayList<>(values.length); + for (Float value : values) { + if (value == null) { + buffers.add(ByteBuffer.allocate(0)); + } else { + buffers.add(ByteBuffer.wrap(BytesUtils.intToBytes(Float.floatToIntBits(value)))); + } + } + return buffers; + } + + private static List toBBList(Integer... values) { + List buffers = new ArrayList<>(values.length); + for (Integer value : values) { + if (value == null) { + buffers.add(ByteBuffer.allocate(0)); + } else { + buffers.add(ByteBuffer.wrap(BytesUtils.intToBytes(value))); + } + } + return buffers; + } + + private static List toBBList(Long... values) { + List buffers = new ArrayList<>(values.length); + for (Long value : values) { + if (value == null) { + buffers.add(ByteBuffer.allocate(0)); + } else { + buffers.add(ByteBuffer.wrap(BytesUtils.longToBytes(value))); + } + } + return buffers; + } + + private static Binary decimalBinary(String num) { + return Binary.fromConstantByteArray(new BigDecimal(num).unscaledValue().toByteArray()); + } + + private static Binary stringBinary(String str) { + return Binary.fromString(str); + } + + private static void assertCorrectValues(List values, Binary... 
expectedValues) { + assertEquals(expectedValues.length, values.size()); + for (int i = 0; i < expectedValues.length; ++i) { + Binary expectedValue = expectedValues[i]; + ByteBuffer value = values.get(i); + if (expectedValue == null) { + assertFalse("The byte buffer should be empty for null pages", value.hasRemaining()); + } else { + assertArrayEquals("Invalid value for page " + i, expectedValue.getBytesUnsafe(), value.array()); + } + } + } + + private static void assertCorrectValues(List values, Boolean... expectedValues) { + assertEquals(expectedValues.length, values.size()); + for (int i = 0; i < expectedValues.length; ++i) { + Boolean expectedValue = expectedValues[i]; + ByteBuffer value = values.get(i); + if (expectedValue == null) { + assertFalse("The byte buffer should be empty for null pages", value.hasRemaining()); + } else { + assertEquals("The byte buffer should be 1 byte long for boolean", 1, value.remaining()); + assertEquals("Invalid value for page " + i, expectedValue.booleanValue(), value.get(0) != 0); + } + } + } + + private static void assertCorrectValues(List values, Double... expectedValues) { + assertEquals(expectedValues.length, values.size()); + for (int i = 0; i < expectedValues.length; ++i) { + Double expectedValue = expectedValues[i]; + ByteBuffer value = values.get(i); + if (expectedValue == null) { + assertFalse("The byte buffer should be empty for null pages", value.hasRemaining()); + } else { + assertEquals("The byte buffer should be 8 bytes long for double", 8, value.remaining()); + assertTrue("Invalid value for page " + i, Double.compare(expectedValue.doubleValue(), value.getDouble(0)) == 0); + } + } + } + + private static void assertCorrectValues(List values, Float... 
expectedValues) {
+    assertEquals(expectedValues.length, values.size());
+    for (int i = 0; i < expectedValues.length; ++i) {
+      Float expectedValue = expectedValues[i];
+      ByteBuffer value = values.get(i);
+      if (expectedValue == null) {
+        assertFalse("The byte buffer should be empty for null pages", value.hasRemaining());
+      } else {
+        assertEquals("The byte buffer should be 4 bytes long for float", 4, value.remaining());
+        // Float.compare treats NaN as equal to itself and tells -0.0f from 0.0f, unlike ==.
+        assertTrue("Invalid value for page " + i, Float.compare(expectedValue.floatValue(), value.getFloat(0)) == 0);
+      }
+    }
+  }
+
+  /**
+   * Asserts that the serialized INT32 min or max values of a column index match the expected ones.
+   *
+   * @param values         the serialized values, one buffer per page
+   * @param expectedValues the expected value of each page; {@code null} marks a null page whose buffer must be empty
+   */
+  private static void assertCorrectValues(List<ByteBuffer> values, Integer... expectedValues) {
+    assertEquals(expectedValues.length, values.size());
+    for (int i = 0; i < expectedValues.length; ++i) {
+      Integer expectedValue = expectedValues[i];
+      ByteBuffer value = values.get(i);
+      if (expectedValue == null) {
+        assertFalse("The byte buffer should be empty for null pages", value.hasRemaining());
+      } else {
+        assertEquals("The byte buffer should be 4 bytes long for int32", 4, value.remaining());
+        assertEquals("Invalid value for page " + i, expectedValue.intValue(), value.getInt(0));
+      }
+    }
+  }
+
+  /**
+   * Asserts that the serialized INT64 min or max values of a column index match the expected ones.
+   *
+   * @param values         the serialized values, one buffer per page
+   * @param expectedValues the expected value of each page; {@code null} marks a null page whose buffer must be empty
+   */
+  private static void assertCorrectValues(List<ByteBuffer> values, Long... expectedValues) {
+    assertEquals(expectedValues.length, values.size());
+    for (int i = 0; i < expectedValues.length; ++i) {
+      Long expectedValue = expectedValues[i];
+      ByteBuffer value = values.get(i);
+      if (expectedValue == null) {
+        assertFalse("The byte buffer should be empty for null pages", value.hasRemaining());
+      } else {
+        assertEquals("The byte buffer should be 8 bytes long for int64", 8, value.remaining());
+        // Compare as long: intValue() would truncate the expected value to its low 32 bits.
+        assertEquals("Invalid value for page " + i, expectedValue.longValue(), value.getLong(0));
+      }
+    }
+  }
+
+  private static void assertCorrectNullCounts(ColumnIndex columnIndex, long... 
expectedNullCounts) { + List nullCounts = columnIndex.getNullCounts(); + assertEquals(expectedNullCounts.length, nullCounts.size()); + for (int i = 0; i < expectedNullCounts.length; ++i) { + assertEquals("Invalid null count at page " + i, expectedNullCounts[i], nullCounts.get(i).longValue()); + } + } + + private static void assertCorrectNullPages(ColumnIndex columnIndex, boolean... expectedNullPages) { + List nullPages = columnIndex.getNullPages(); + assertEquals(expectedNullPages.length, nullPages.size()); + for (int i = 0; i < expectedNullPages.length; ++i) { + assertEquals("Invalid null pages at page " + i, expectedNullPages[i], nullPages.get(i).booleanValue()); + } + } + + private static class StatsBuilder { + private long minMaxSize; + + Statistics stats(PrimitiveType type, Object... values) { + Statistics stats = Statistics.createStats(type); + for (Object value : values) { + if (value == null) { + stats.incrementNumNulls(); + continue; + } + switch (type.getPrimitiveTypeName()) { + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + case INT96: + stats.updateStats((Binary) value); + break; + case BOOLEAN: + stats.updateStats((boolean) value); + break; + case DOUBLE: + stats.updateStats((double) value); + break; + case FLOAT: + stats.updateStats((float) value); + break; + case INT32: + stats.updateStats((int) value); + break; + case INT64: + stats.updateStats((long) value); + break; + default: + fail("Unsupported value type for stats: " + value.getClass()); + } + } + if (stats.hasNonNullValue()) { + minMaxSize += stats.getMinBytes().length; + minMaxSize += stats.getMaxBytes().length; + } + return stats; + } + + long getMinMaxSize() { + return minMaxSize; + } + } + + private static void assertCorrectFiltering(ColumnIndex ci, FilterPredicate predicate, int... 
expectedIndexes) { + TestIndexIterator.assertEquals(predicate.accept(ci), expectedIndexes); + } +} diff --git a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestIndexIterator.java b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestIndexIterator.java new file mode 100644 index 0000000000..d9047f26d4 --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestIndexIterator.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.internal.column.columnindex; + +import static org.junit.Assert.assertArrayEquals; + +import java.util.Arrays; +import java.util.PrimitiveIterator; + +import org.junit.Test; + +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.ints.IntList; + +/** + * Unit test for {@link IndexIterator}. 
+ */ +public class TestIndexIterator { + @Test + public void testAll() { + assertEquals(IndexIterator.all(10), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9); + } + + @Test + public void testFilter() { + assertEquals(IndexIterator.filter(30, value -> value % 3 == 0), 0, 3, 6, 9, 12, 15, 18, 21, 24, 27); + } + + @Test + public void testFilterTranslate() { + assertEquals(IndexIterator.filterTranslate(20, value -> value < 5, Math::negateExact), 0, -1, -2, -3, -4); + } + + @Test + public void testRangeTranslate() { + assertEquals(IndexIterator.rangeTranslate(11, 18, i -> i - 10), 1, 2, 3, 4, 5, 6, 7, 8); + } + + static void assertEquals(PrimitiveIterator.OfInt actualIt, int... expectedValues) { + IntList actualList = new IntArrayList(); + actualIt.forEachRemaining((int value) -> actualList.add(value)); + int[] actualValues = actualList.toIntArray(); + assertArrayEquals( + "ExpectedValues: " + Arrays.toString(expectedValues) + " ActualValues: " + Arrays.toString(actualValues), + expectedValues, actualValues); + } +} diff --git a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestOffsetIndexBuilder.java b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestOffsetIndexBuilder.java new file mode 100644 index 0000000000..1e1275c84f --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestOffsetIndexBuilder.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.internal.column.columnindex; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder; +import org.junit.Test; + +/** + * Tests for {@link OffsetIndexBuilder}. + */ +public class TestOffsetIndexBuilder { + @Test + public void testBuilderWithSizeAndRowCount() { + OffsetIndexBuilder builder = OffsetIndexBuilder.getBuilder(); + assertNull(builder.build()); + assertNull(builder.build(1234)); + + builder.add(1000, 10); + builder.add(2000, 19); + builder.add(3000, 27); + builder.add(1200, 9); + assertCorrectValues(builder.build(), + 0, 1000, 0, + 1000, 2000, 10, + 3000, 3000, 29, + 6000, 1200, 56); + assertCorrectValues(builder.build(10000), + 10000, 1000, 0, + 11000, 2000, 10, + 13000, 3000, 29, + 16000, 1200, 56); + } + + @Test + public void testNoOpBuilderWithSizeAndRowCount() { + OffsetIndexBuilder builder = OffsetIndexBuilder.getNoOpBuilder(); + builder.add(1, 2); + builder.add(3, 4); + builder.add(5, 6); + builder.add(7, 8); + assertNull(builder.build()); + assertNull(builder.build(1000)); + } + + @Test + public void testBuilderWithOffsetSizeIndex() { + OffsetIndexBuilder builder = OffsetIndexBuilder.getBuilder(); + assertNull(builder.build()); + assertNull(builder.build(1234)); + + builder.add(1000, 10000, 0); + builder.add(22000, 12000, 100); + builder.add(48000, 22000, 211); + builder.add(90000, 30000, 361); + 
assertCorrectValues(builder.build(), + 1000, 10000, 0, + 22000, 12000, 100, + 48000, 22000, 211, + 90000, 30000, 361); + assertCorrectValues(builder.build(100000), + 101000, 10000, 0, + 122000, 12000, 100, + 148000, 22000, 211, + 190000, 30000, 361); + } + + @Test + public void testNoOpBuilderWithOffsetSizeIndex() { + OffsetIndexBuilder builder = OffsetIndexBuilder.getNoOpBuilder(); + builder.add(1, 2, 3); + builder.add(4, 5, 6); + builder.add(7, 8, 9); + builder.add(10, 11, 12); + assertNull(builder.build()); + assertNull(builder.build(1000)); + } + + private void assertCorrectValues(OffsetIndex offsetIndex, long... offset_size_rowIndex_triplets) { + assertEquals(offset_size_rowIndex_triplets.length % 3, 0); + int pageCount = offset_size_rowIndex_triplets.length / 3; + assertEquals("Invalid pageCount", pageCount, offsetIndex.getPageCount()); + for (int i = 0; i < pageCount; ++i) { + assertEquals("Invalid offsetIndex at page " + i, offset_size_rowIndex_triplets[3 * i], + offsetIndex.getOffset(i)); + assertEquals("Invalid compressedPageSize at page " + i, offset_size_rowIndex_triplets[3 * i + 1], + offsetIndex.getCompressedPageSize(i)); + assertEquals("Invalid firstRowIndex at page " + i, offset_size_rowIndex_triplets[3 * i + 2], + offsetIndex.getFirstRowIndex(i)); + long expectedLastPageIndex = (i < pageCount - 1) ? 
(offset_size_rowIndex_triplets[3 * i + 5] - 1) : 999; + assertEquals("Invalid lastRowIndex at page " + i, expectedLastPageIndex, offsetIndex.getLastRowIndex(i, 1000)); + } + } +} diff --git a/parquet-column/src/test/java/org/apache/parquet/internal/filter2/columnindex/TestColumnIndexFilter.java b/parquet-column/src/test/java/org/apache/parquet/internal/filter2/columnindex/TestColumnIndexFilter.java new file mode 100644 index 0000000000..ae27214582 --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/internal/filter2/columnindex/TestColumnIndexFilter.java @@ -0,0 +1,464 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.internal.filter2.columnindex; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.parquet.filter2.predicate.FilterApi.and; +import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.booleanColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.doubleColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.eq; +import static org.apache.parquet.filter2.predicate.FilterApi.gt; +import static org.apache.parquet.filter2.predicate.FilterApi.gtEq; +import static org.apache.parquet.filter2.predicate.FilterApi.intColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.lt; +import static org.apache.parquet.filter2.predicate.FilterApi.ltEq; +import static org.apache.parquet.filter2.predicate.FilterApi.notEq; +import static org.apache.parquet.filter2.predicate.FilterApi.or; +import static org.apache.parquet.filter2.predicate.FilterApi.userDefined; +import static org.apache.parquet.filter2.predicate.LogicalInverter.invert; +import static org.apache.parquet.internal.column.columnindex.BoundaryOrder.ASCENDING; +import static org.apache.parquet.internal.column.columnindex.BoundaryOrder.DESCENDING; +import static org.apache.parquet.internal.column.columnindex.BoundaryOrder.UNORDERED; +import static org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.calculateRowRanges; +import static org.apache.parquet.io.api.Binary.fromString; +import static org.apache.parquet.schema.OriginalType.UTF8; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.Types.optional; +import static org.junit.Assert.assertArrayEquals; + +import java.nio.ByteBuffer; +import 
java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.LongStream; + +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.predicate.Statistics; +import org.apache.parquet.filter2.predicate.UserDefinedPredicate; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.internal.column.columnindex.BoundaryOrder; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder; +import org.apache.parquet.internal.column.columnindex.TestColumnIndexBuilder.BinaryUtf8StartsWithB; +import org.apache.parquet.internal.column.columnindex.TestColumnIndexBuilder.DoubleIsInteger; +import org.apache.parquet.internal.column.columnindex.TestColumnIndexBuilder.IntegerIsDivisableWith3; +import org.apache.parquet.schema.PrimitiveType; +import org.junit.Test; + +import it.unimi.dsi.fastutil.longs.LongArrayList; +import it.unimi.dsi.fastutil.longs.LongList; + +/** + * Unit tests of {@link ColumnIndexFilter} + */ +public class TestColumnIndexFilter { + private static class CIBuilder { + private static final ByteBuffer EMPTY = ByteBuffer.wrap(new byte[0]); + private final PrimitiveType type; + private final BoundaryOrder order; + private List nullPages = new ArrayList<>(); + private List nullCounts = new ArrayList<>(); + private List minValues = new ArrayList<>(); + private List maxValues = new ArrayList<>(); + + CIBuilder(PrimitiveType type, BoundaryOrder order) { + this.type = type; + this.order = order; + } + + CIBuilder addNullPage(long nullCount) { + nullPages.add(true); + nullCounts.add(nullCount); + minValues.add(EMPTY); + maxValues.add(EMPTY); + return this; + } + + 
CIBuilder addPage(long nullCount, int min, int max) { + nullPages.add(false); + nullCounts.add(nullCount); + minValues.add(ByteBuffer.wrap(BytesUtils.intToBytes(min))); + maxValues.add(ByteBuffer.wrap(BytesUtils.intToBytes(max))); + return this; + } + + CIBuilder addPage(long nullCount, String min, String max) { + nullPages.add(false); + nullCounts.add(nullCount); + minValues.add(ByteBuffer.wrap(min.getBytes(UTF_8))); + maxValues.add(ByteBuffer.wrap(max.getBytes(UTF_8))); + return this; + } + + CIBuilder addPage(long nullCount, double min, double max) { + nullPages.add(false); + nullCounts.add(nullCount); + minValues.add(ByteBuffer.wrap(BytesUtils.longToBytes(Double.doubleToLongBits(min)))); + maxValues.add(ByteBuffer.wrap(BytesUtils.longToBytes(Double.doubleToLongBits(max)))); + return this; + } + + ColumnIndex build() { + return ColumnIndexBuilder.build(type, order, nullPages, nullCounts, minValues, maxValues); + } + } + + private static class OIBuilder { + private final OffsetIndexBuilder builder = OffsetIndexBuilder.getBuilder(); + + OIBuilder addPage(long rowCount) { + builder.add(1234, rowCount); + return this; + } + + OffsetIndex build() { + return builder.build(); + } + } + + public static class AnyInt extends UserDefinedPredicate { + + @Override + public boolean keep(Integer value) { + return true; + } + + @Override + public boolean canDrop(Statistics statistics) { + return false; + } + + @Override + public boolean inverseCanDrop(Statistics statistics) { + return true; + } + + } + + /** + *
+   * row     column1        column2        column3        column4 (no column index)
+   *      ------0------  ------0------  ------0------  ------0------
+   * 0.   1              Zulu           2.03
+   *      ------1------  ------1------  ------1------  ------1------
+   * 1.   2              Yankee         4.67
+   * 2.   3              Xray           3.42
+   * 3.   4              Whiskey        8.71
+   *                     ------2------                 ------2------
+   * 4.   5              Victor         0.56
+   * 5.   6              Uniform        4.30
+   *                                    ------2------  ------3------
+   * 6.   null           null           null
+   *      ------2------                                ------4------
+   * 7.   7              Tango          3.50
+   *                     ------3------
+   * 8.   7              null           3.14
+   *      ------3------
+   * 9.   7              null           null
+   *                                    ------3------
+   * 10.  null           null           9.99
+   *                     ------4------
+   * 11.  8              Sierra         8.78
+   *                                                   ------5------
+   * 12.  9              Romeo          9.56
+   * 13.  10             Quebec         2.71
+   *      ------4------
+   * 14.  11             Papa           5.71
+   * 15.  12             Oscar          4.09
+   *                     ------5------  ------4------  ------6------
+   * 16.  13             November       null
+   * 17.  14             Mike           null
+   * 18.  15             Lima           0.36
+   * 19.  16             Kilo           2.94
+   * 20.  17             Juliett        4.23
+   *      ------5------  ------6------                 ------7------
+   * 21.  18             India          null
+   * 22.  19             Hotel          5.32
+   *                                    ------5------
+   * 23.  20             Golf           4.17
+   * 24.  21             Foxtrot        7.92
+   * 25.  22             Echo           7.95
+   *                                    ------6------
+   * 26.  23             Delta          null
+   *      ------6------
+   * 27.  24             Charlie        null
+   *                                                   ------8------
+   * 28.  25             Bravo          null
+   *                     ------7------
+   * 29.  26             Alfa           null
+   * 
+ */ + private static final long TOTAL_ROW_COUNT = 30; + private static final ColumnIndex COLUMN1_CI = new CIBuilder(optional(INT32).named("column1"), ASCENDING) + .addPage(0, 1, 1) + .addPage(1, 2, 6) + .addPage(0, 7, 7) + .addPage(1, 7, 10) + .addPage(0, 11, 17) + .addPage(0, 18, 23) + .addPage(0, 24, 26) + .build(); + private static final OffsetIndex COLUMN1_OI = new OIBuilder() + .addPage(1) + .addPage(6) + .addPage(2) + .addPage(5) + .addPage(7) + .addPage(6) + .addPage(3) + .build(); + private static final ColumnIndex COLUMN2_CI = new CIBuilder(optional(BINARY).as(UTF8).named("column2"), DESCENDING) + .addPage(0, "Zulu", "Zulu") + .addPage(0, "Whiskey", "Yankee") + .addPage(1, "Tango", "Victor") + .addNullPage(3) + .addPage(0, "Oscar", "Sierra") + .addPage(0, "Juliett", "November") + .addPage(0, "Bravo", "India") + .addPage(0, "Alfa", "Alfa") + .build(); + private static final OffsetIndex COLUMN2_OI = new OIBuilder() + .addPage(1) + .addPage(3) + .addPage(4) + .addPage(3) + .addPage(5) + .addPage(5) + .addPage(8) + .addPage(1) + .build(); + private static final ColumnIndex COLUMN3_CI = new CIBuilder(optional(DOUBLE).named("column3"), UNORDERED) + .addPage(0, 2.03, 2.03) + .addPage(0, 0.56, 8.71) + .addPage(2, 3.14, 3.50) + .addPage(0, 2.71, 9.99) + .addPage(3, 0.36, 5.32) + .addPage(0, 4.17, 7.95) + .addNullPage(4) + .build(); + private static final OffsetIndex COLUMN3_OI = new OIBuilder() + .addPage(1) + .addPage(5) + .addPage(4) + .addPage(6) + .addPage(7) + .addPage(3) + .addPage(4) + .build(); + private static final ColumnIndex COLUMN4_CI = null; + private static final OffsetIndex COLUMN4_OI = new OIBuilder() + .addPage(1) + .addPage(3) + .addPage(2) + .addPage(1) + .addPage(5) + .addPage(4) + .addPage(5) + .addPage(7) + .addPage(2) + .build(); + private static final ColumnIndexStore STORE = new ColumnIndexStore() { + @Override + public ColumnIndex getColumnIndex(ColumnPath column) { + switch (column.toDotString()) { + case "column1": + return COLUMN1_CI; 
+ case "column2": + return COLUMN2_CI; + case "column3": + return COLUMN3_CI; + case "column4": + return COLUMN4_CI; + default: + return null; + } + } + + @Override + public OffsetIndex getOffsetIndex(ColumnPath column) { + switch (column.toDotString()) { + case "column1": + return COLUMN1_OI; + case "column2": + return COLUMN2_OI; + case "column3": + return COLUMN3_OI; + case "column4": + return COLUMN4_OI; + default: + throw new MissingOffsetIndexException(column); + } + } + }; + + private static Set paths(String... columns) { + Set paths = new HashSet<>(); + for (String column : columns) { + paths.add(ColumnPath.fromDotString(column)); + } + return paths; + } + + private static void assertAllRows(RowRanges ranges, long rowCount) { + LongList actualList = new LongArrayList(); + ranges.iterator().forEachRemaining((long value) -> actualList.add(value)); + LongList expectedList = new LongArrayList(); + LongStream.range(0, rowCount).forEach(expectedList::add); + assertArrayEquals(expectedList + " != " + actualList, expectedList.toLongArray(), actualList.toLongArray()); + } + + private static void assertRows(RowRanges ranges, long... 
expectedRows) { + LongList actualList = new LongArrayList(); + ranges.iterator().forEachRemaining((long value) -> actualList.add(value)); + assertArrayEquals(Arrays.toString(expectedRows) + " != " + actualList, expectedRows, actualList.toLongArray()); + } + + @Test + public void testFiltering() { + Set paths = paths("column1", "column2", "column3", "column4"); + + assertAllRows( + calculateRowRanges(FilterCompat.get( + userDefined(intColumn("column1"), AnyInt.class)), STORE, paths, TOTAL_ROW_COUNT), + TOTAL_ROW_COUNT); + assertRows(calculateRowRanges(FilterCompat.get( + and( + and( + eq(intColumn("column1"), null), + eq(binaryColumn("column2"), null)), + and( + eq(doubleColumn("column3"), null), + eq(booleanColumn("column4"), null)))), + STORE, paths, TOTAL_ROW_COUNT), + 6, 9); + assertRows(calculateRowRanges(FilterCompat.get( + and( + and( + notEq(intColumn("column1"), null), + notEq(binaryColumn("column2"), null)), + and( + notEq(doubleColumn("column3"), null), + notEq(booleanColumn("column4"), null)))), + STORE, paths, TOTAL_ROW_COUNT), + 0, 1, 2, 3, 4, 5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); + assertRows(calculateRowRanges(FilterCompat.get( + or( + and( + lt(intColumn("column1"), 20), + gtEq(binaryColumn("column2"), fromString("Quebec"))), + and( + gt(doubleColumn("column3"), 5.32), + ltEq(binaryColumn("column4"), fromString("XYZ"))))), + STORE, paths, TOTAL_ROW_COUNT), + 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 23, 24, 25); + assertRows(calculateRowRanges(FilterCompat.get( + and( + and( + gtEq(intColumn("column1"), 7), + gt(binaryColumn("column2"), fromString("India"))), + and( + eq(doubleColumn("column3"), null), + notEq(binaryColumn("column4"), null)))), + STORE, paths, TOTAL_ROW_COUNT), + 7, 16, 17, 18, 19, 20); + assertRows(calculateRowRanges(FilterCompat.get( + and( + or( + invert(userDefined(intColumn("column1"), AnyInt.class)), + eq(binaryColumn("column2"), fromString("Echo"))), + eq(doubleColumn("column3"), 6.0))), 
+ STORE, paths, TOTAL_ROW_COUNT), + 23, 24, 25); + assertRows(calculateRowRanges(FilterCompat.get( + and( + userDefined(intColumn("column1"), IntegerIsDivisableWith3.class), + and( + userDefined(binaryColumn("column2"), BinaryUtf8StartsWithB.class), + userDefined(doubleColumn("column3"), DoubleIsInteger.class)))), + STORE, paths, TOTAL_ROW_COUNT), + 21, 22, 23, 24, 25); + assertRows(calculateRowRanges(FilterCompat.get( + and( + and( + gtEq(intColumn("column1"), 7), + lt(intColumn("column1"), 11)), + and( + gt(binaryColumn("column2"), fromString("Romeo")), + ltEq(binaryColumn("column2"), fromString("Tango"))))), + STORE, paths, TOTAL_ROW_COUNT), + 7, 11, 12, 13); + } + + @Test + public void testFilteringOnMissingColumns() { + Set paths = paths("column1", "column2", "column3", "column4"); + + // Missing column filter is always true + assertAllRows(calculateRowRanges(FilterCompat.get( + notEq(intColumn("missing_column"), 0)), + STORE, paths, TOTAL_ROW_COUNT), + TOTAL_ROW_COUNT); + assertRows(calculateRowRanges(FilterCompat.get( + and( + and( + gtEq(intColumn("column1"), 7), + lt(intColumn("column1"), 11)), + eq(binaryColumn("missing_column"), null))), + STORE, paths, TOTAL_ROW_COUNT), + 7, 8, 9, 10, 11, 12, 13); + + // Missing column filter is always false + assertRows(calculateRowRanges(FilterCompat.get( + or( + and( + gtEq(intColumn("column1"), 7), + lt(intColumn("column1"), 11)), + notEq(binaryColumn("missing_column"), null))), + STORE, paths, TOTAL_ROW_COUNT), + 7, 8, 9, 10, 11, 12, 13); + assertRows(calculateRowRanges(FilterCompat.get( + gt(intColumn("missing_column"), 0)), + STORE, paths, TOTAL_ROW_COUNT)); + } + + @Test + public void testFilteringWithMissingOffsetIndex() { + Set paths = paths("column1", "column2", "column3", "column4", "column_wo_oi"); + + assertAllRows(calculateRowRanges(FilterCompat.get( + and( + and( + gtEq(intColumn("column1"), 7), + lt(intColumn("column1"), 11)), + and( + gt(binaryColumn("column2"), fromString("Romeo")), + 
ltEq(binaryColumn("column_wo_oi"), fromString("Tango"))))), + STORE, paths, TOTAL_ROW_COUNT), + TOTAL_ROW_COUNT); + } + +} diff --git a/parquet-column/src/test/java/org/apache/parquet/internal/filter2/columnindex/TestRowRanges.java b/parquet-column/src/test/java/org/apache/parquet/internal/filter2/columnindex/TestRowRanges.java new file mode 100644 index 0000000000..71b8844990 --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/internal/filter2/columnindex/TestRowRanges.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.internal.filter2.columnindex; + +import static org.apache.parquet.internal.filter2.columnindex.RowRanges.intersection; +import static org.apache.parquet.internal.filter2.columnindex.RowRanges.union; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.util.Arrays; +import java.util.PrimitiveIterator; + +import org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder; +import org.junit.Test; + +import it.unimi.dsi.fastutil.longs.LongArrayList; +import it.unimi.dsi.fastutil.longs.LongList; + +/** + * Unit test for {@link RowRanges} + */ +public class TestRowRanges { + private static RowRanges buildRanges(long... rowIndexes) { + if (rowIndexes.length == 0) { + return RowRanges.EMPTY; + } + OffsetIndexBuilder builder = OffsetIndexBuilder.getBuilder(); + for (int i = 0, n = rowIndexes.length; i < n; i += 2) { + long from = rowIndexes[i]; + long to = rowIndexes[i + 1]; + builder.add(0, 0, from); + builder.add(0, 0, to + 1); + } + PrimitiveIterator.OfInt pageIndexes = new PrimitiveIterator.OfInt() { + private int index = 0; + + @Override + public boolean hasNext() { + return index < rowIndexes.length; + } + + @Override + public int nextInt() { + int ret = index; + index += 2; + return ret; + } + }; + return RowRanges.create(rowIndexes[rowIndexes.length - 1], pageIndexes, builder.build()); + } + + private static void assertAllRowsEqual(PrimitiveIterator.OfLong actualIt, long... 
expectedValues) { + LongList actualList = new LongArrayList(); + actualIt.forEachRemaining((long value) -> actualList.add(value)); + assertArrayEquals(Arrays.toString(expectedValues) + "!= " + actualList, expectedValues, actualList.toLongArray()); + } + + @Test + public void testCreate() { + RowRanges ranges = buildRanges( + 1, 2, + 3, 4, + 6, 7, + 7, 10, + 15, 17); + assertAllRowsEqual(ranges.iterator(), 1, 2, 3, 4, 6, 7, 8, 9, 10, 15, 16, 17); + assertEquals(12, ranges.rowCount()); + assertTrue(ranges.isOverlapping(4, 5)); + assertFalse(ranges.isOverlapping(5, 5)); + assertTrue(ranges.isOverlapping(10, 14)); + assertFalse(ranges.isOverlapping(11, 14)); + assertFalse(ranges.isOverlapping(18, Long.MAX_VALUE)); + + ranges = RowRanges.createSingle(5); + assertAllRowsEqual(ranges.iterator(), 0, 1, 2, 3, 4); + assertEquals(5, ranges.rowCount()); + assertTrue(ranges.isOverlapping(0, 100)); + assertFalse(ranges.isOverlapping(5, Long.MAX_VALUE)); + + ranges = RowRanges.EMPTY; + assertAllRowsEqual(ranges.iterator()); + assertEquals(0, ranges.rowCount()); + assertFalse(ranges.isOverlapping(0, Long.MAX_VALUE)); + } + + @Test + public void testUnion() { + RowRanges ranges1 = buildRanges( + 2, 5, + 7, 9, + 14, 14, + 20, 24); + RowRanges ranges2 = buildRanges( + 1, 2, + 4, 5, + 11, 12, + 14, 15, + 21, 22); + RowRanges empty = buildRanges(); + assertAllRowsEqual(union(ranges1, ranges2).iterator(), 1, 2, 3, 4, 5, 7, 8, 9, 11, 12, 14, 15, 20, 21, 22, 23, 24); + assertAllRowsEqual(union(ranges2, ranges1).iterator(), 1, 2, 3, 4, 5, 7, 8, 9, 11, 12, 14, 15, 20, 21, 22, 23, 24); + assertAllRowsEqual(union(ranges1, ranges1).iterator(), 2, 3, 4, 5, 7, 8, 9, 14, 20, 21, 22, 23, 24); + assertAllRowsEqual(union(ranges1, empty).iterator(), 2, 3, 4, 5, 7, 8, 9, 14, 20, 21, 22, 23, 24); + assertAllRowsEqual(union(empty, ranges1).iterator(), 2, 3, 4, 5, 7, 8, 9, 14, 20, 21, 22, 23, 24); + assertAllRowsEqual(union(ranges2, ranges2).iterator(), 1, 2, 4, 5, 11, 12, 14, 15, 21, 22); + 
assertAllRowsEqual(union(ranges2, empty).iterator(), 1, 2, 4, 5, 11, 12, 14, 15, 21, 22); + assertAllRowsEqual(union(empty, ranges2).iterator(), 1, 2, 4, 5, 11, 12, 14, 15, 21, 22); + assertAllRowsEqual(union(empty, empty).iterator()); + } + + @Test + public void testIntersection() { + RowRanges ranges1 = buildRanges( + 2, 5, + 7, 9, + 14, 14, + 20, 24); + RowRanges ranges2 = buildRanges( + 1, 2, + 6, 7, + 9, 9, + 11, 12, + 14, 15, + 21, 22); + RowRanges empty = buildRanges(); + assertAllRowsEqual(intersection(ranges1, ranges2).iterator(), 2, 7, 9, 14, 21, 22); + assertAllRowsEqual(intersection(ranges2, ranges1).iterator(), 2, 7, 9, 14, 21, 22); + assertAllRowsEqual(intersection(ranges1, ranges1).iterator(), 2, 3, 4, 5, 7, 8, 9, 14, 20, 21, 22, 23, 24); + assertAllRowsEqual(intersection(ranges1, empty).iterator()); + assertAllRowsEqual(intersection(empty, ranges1).iterator()); + assertAllRowsEqual(intersection(ranges2, ranges2).iterator(), 1, 2, 6, 7, 9, 11, 12, 14, 15, 21, 22); + assertAllRowsEqual(intersection(ranges2, empty).iterator()); + assertAllRowsEqual(intersection(empty, ranges2).iterator()); + assertAllRowsEqual(intersection(empty, empty).iterator()); + } + +} diff --git a/parquet-column/src/test/java/org/apache/parquet/parser/TestParquetParser.java b/parquet-column/src/test/java/org/apache/parquet/parser/TestParquetParser.java index d8536012ba..fa200ab424 100644 --- a/parquet-column/src/test/java/org/apache/parquet/parser/TestParquetParser.java +++ b/parquet-column/src/test/java/org/apache/parquet/parser/TestParquetParser.java @@ -19,6 +19,7 @@ package org.apache.parquet.parser; import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MILLIS; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.NANOS; import static org.apache.parquet.schema.LogicalTypeAnnotation.intType; import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; import static org.apache.parquet.schema.LogicalTypeAnnotation.timeType; @@ 
-271,7 +272,9 @@ public void testTimeAnnotations() { " required int64 timestamp (TIMESTAMP_MILLIS);" + " required FIXED_LEN_BYTE_ARRAY(12) interval (INTERVAL);" + " required int32 newTime (TIME(MILLIS,true));" + + " required int64 nanoTime (TIME(NANOS,true));" + " required int64 newTimestamp (TIMESTAMP(MILLIS,false));" + + " required int64 nanoTimestamp (TIMESTAMP(NANOS,false));" + "}\n"; MessageType parsed = MessageTypeParser.parseMessageType(message); @@ -281,7 +284,9 @@ public void testTimeAnnotations() { .required(INT64).as(TIMESTAMP_MILLIS).named("timestamp") .required(FIXED_LEN_BYTE_ARRAY).length(12).as(INTERVAL).named("interval") .required(INT32).as(timeType(true, MILLIS)).named("newTime") + .required(INT64).as(timeType(true, NANOS)).named("nanoTime") .required(INT64).as(timestampType(false, MILLIS)).named("newTimestamp") + .required(INT64).as(timestampType(false, NANOS)).named("nanoTimestamp") .named("TimeMessage"); assertEquals(expected, parsed); diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java index 05619385bc..e511d4252f 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java @@ -148,7 +148,7 @@ public void testMergeSchema() { t9.union(t10); fail("moving from BINARY (UTF8) to BINARY"); } catch (IncompatibleSchemaModificationException e) { - assertEquals("cannot merge original type null into UTF8", e.getMessage()); + assertEquals("cannot merge logical type null into STRING", e.getMessage()); } MessageType t11 = Types.buildMessage() diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java index 3f9d6431b5..0bf3599419 100644 --- 
a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java @@ -23,6 +23,8 @@ import java.math.BigInteger; import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; import static org.apache.parquet.schema.PrimitiveComparator.BOOLEAN_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.DOUBLE_COMPARATOR; @@ -249,6 +251,23 @@ public void testBinaryAsSignedIntegerComparator() { ByteBuffer.wrap(new BigInteger("9999999999999999999999999999999999999999").toByteArray()))); } + @Test + public void testBinaryAsSignedIntegerComparatorWithEquals() { + List valuesToCompare = new ArrayList<>(); + valuesToCompare.add(Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[] { 0, 0, -108 }))); + valuesToCompare.add(Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[] { 0, 0, 0, 0, 0, -108 }))); + valuesToCompare.add(Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[] { 0, 0, 0, -108 }))); + valuesToCompare.add(Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[] { 0, 0, 0, 0, -108 }))); + valuesToCompare.add(Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[] { 0, -108 }))); + + for (Binary v1 : valuesToCompare) { + for (Binary v2 : valuesToCompare) { + assertEquals(String.format("Wrong result of comparison %s and %s", v1, v2), + 0, BINARY_AS_SIGNED_INTEGER_COMPARATOR.compare(v1, v2)); + } + } + } + private void testObjectComparator(PrimitiveComparator comparator, T... 
valuesInAscendingOrder) { for (int i = 0; i < valuesInAscendingOrder.length; ++i) { for (int j = 0; j < valuesInAscendingOrder.length; ++j) { diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveStringifier.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveStringifier.java index 53045cfb8c..b5de4f850e 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveStringifier.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveStringifier.java @@ -19,15 +19,26 @@ package org.apache.parquet.schema; import static java.nio.charset.StandardCharsets.UTF_8; +import static java.util.Arrays.asList; import static java.util.concurrent.TimeUnit.HOURS; import static java.util.concurrent.TimeUnit.MICROSECONDS; import static java.util.concurrent.TimeUnit.MILLISECONDS; import static java.util.concurrent.TimeUnit.MINUTES; +import static java.util.concurrent.TimeUnit.NANOSECONDS; import static java.util.concurrent.TimeUnit.SECONDS; import static org.apache.parquet.schema.PrimitiveStringifier.DATE_STRINGIFIER; import static org.apache.parquet.schema.PrimitiveStringifier.DEFAULT_STRINGIFIER; import static org.apache.parquet.schema.PrimitiveStringifier.INTERVAL_STRINGIFIER; +import static org.apache.parquet.schema.PrimitiveStringifier.TIMESTAMP_MICROS_STRINGIFIER; +import static org.apache.parquet.schema.PrimitiveStringifier.TIMESTAMP_MICROS_UTC_STRINGIFIER; +import static org.apache.parquet.schema.PrimitiveStringifier.TIMESTAMP_MILLIS_STRINGIFIER; +import static org.apache.parquet.schema.PrimitiveStringifier.TIMESTAMP_MILLIS_UTC_STRINGIFIER; +import static org.apache.parquet.schema.PrimitiveStringifier.TIMESTAMP_NANOS_STRINGIFIER; +import static org.apache.parquet.schema.PrimitiveStringifier.TIMESTAMP_NANOS_UTC_STRINGIFIER; +import static org.apache.parquet.schema.PrimitiveStringifier.TIME_NANOS_STRINGIFIER; +import static 
org.apache.parquet.schema.PrimitiveStringifier.TIME_NANOS_UTC_STRINGIFIER; import static org.apache.parquet.schema.PrimitiveStringifier.TIME_STRINGIFIER; +import static org.apache.parquet.schema.PrimitiveStringifier.TIME_UTC_STRINGIFIER; import static org.apache.parquet.schema.PrimitiveStringifier.UNSIGNED_STRINGIFIER; import static org.apache.parquet.schema.PrimitiveStringifier.UTF8_STRINGIFIER; import static org.junit.Assert.assertEquals; @@ -35,7 +46,6 @@ import java.math.BigInteger; import java.nio.ByteBuffer; -import java.util.Arrays; import java.util.Calendar; import java.util.HashSet; import java.util.Set; @@ -152,74 +162,124 @@ public void testDateStringifier() { assertEquals("2017-12-14", stringifier.stringify((int) MILLISECONDS.toDays(cal.getTimeInMillis()))); cal.clear(); - cal.set(1492, Calendar.AUGUST, 3); - assertEquals("1492-08-03", stringifier.stringify((int) MILLISECONDS.toDays(cal.getTimeInMillis()))); + cal.set(1583, Calendar.AUGUST, 3); + assertEquals("1583-08-03", stringifier.stringify((int) MILLISECONDS.toDays(cal.getTimeInMillis()))); checkThrowingUnsupportedException(stringifier, Integer.TYPE); } @Test public void testTimestampMillisStringifier() { - PrimitiveStringifier stringifier = PrimitiveStringifier.TIMESTAMP_MILLIS_STRINGIFIER; + for (PrimitiveStringifier stringifier : asList(TIMESTAMP_MILLIS_STRINGIFIER, TIMESTAMP_MILLIS_UTC_STRINGIFIER)) { + String timezoneAmendment = (stringifier == TIMESTAMP_MILLIS_STRINGIFIER ? 
"" : "+0000"); - assertEquals("1970-01-01T00:00:00.000", stringifier.stringify(0l)); + assertEquals(withZoneString("1970-01-01T00:00:00.000", timezoneAmendment), stringifier.stringify(0l)); - Calendar cal = Calendar.getInstance(UTC); - cal.clear(); - cal.set(2017, Calendar.DECEMBER, 15, 10, 9, 54); - cal.set(Calendar.MILLISECOND, 120); - assertEquals("2017-12-15T10:09:54.120", stringifier.stringify(cal.getTimeInMillis())); + Calendar cal = Calendar.getInstance(UTC); + cal.clear(); + cal.set(2017, Calendar.DECEMBER, 15, 10, 9, 54); + cal.set(Calendar.MILLISECOND, 120); + assertEquals(withZoneString("2017-12-15T10:09:54.120", timezoneAmendment), stringifier.stringify(cal.getTimeInMillis())); - cal.clear(); - cal.set(1948, Calendar.NOVEMBER, 23, 20, 19, 1); - cal.set(Calendar.MILLISECOND, 9); - assertEquals("1948-11-23T20:19:01.009", stringifier.stringify(cal.getTimeInMillis())); + cal.clear(); + cal.set(1948, Calendar.NOVEMBER, 23, 20, 19, 1); + cal.set(Calendar.MILLISECOND, 9); + assertEquals(withZoneString("1948-11-23T20:19:01.009", timezoneAmendment), stringifier.stringify(cal.getTimeInMillis())); - checkThrowingUnsupportedException(stringifier, Long.TYPE); + checkThrowingUnsupportedException(stringifier, Long.TYPE); + } } @Test public void testTimestampMicrosStringifier() { - PrimitiveStringifier stringifier = PrimitiveStringifier.TIMESTAMP_MICROS_STRINGIFIER; + for (PrimitiveStringifier stringifier : asList(TIMESTAMP_MICROS_STRINGIFIER, TIMESTAMP_MICROS_UTC_STRINGIFIER)) { + String timezoneAmendment = (stringifier == TIMESTAMP_MICROS_STRINGIFIER ? 
"" : "+0000"); - assertEquals("1970-01-01T00:00:00.000000", stringifier.stringify(0l)); + assertEquals(withZoneString("1970-01-01T00:00:00.000000", timezoneAmendment), stringifier.stringify(0l)); - Calendar cal = Calendar.getInstance(UTC); - cal.clear(); - cal.set(2053, Calendar.JULY, 10, 22, 13, 24); - cal.set(Calendar.MILLISECOND, 84); - long micros = cal.getTimeInMillis() * 1000 + 900; - assertEquals("2053-07-10T22:13:24.084900", stringifier.stringify(micros)); + Calendar cal = Calendar.getInstance(UTC); + cal.clear(); + cal.set(2053, Calendar.JULY, 10, 22, 13, 24); + cal.set(Calendar.MILLISECOND, 84); + long micros = cal.getTimeInMillis() * 1000 + 900; + assertEquals(withZoneString("2053-07-10T22:13:24.084900", timezoneAmendment), stringifier.stringify(micros)); - cal.clear(); - cal.set(1848, Calendar.MARCH, 15, 9, 23, 59); - cal.set(Calendar.MILLISECOND, 765); - micros = cal.getTimeInMillis() * 1000 - 1; - assertEquals("1848-03-15T09:23:59.765001", stringifier.stringify(micros)); + cal.clear(); + cal.set(1848, Calendar.MARCH, 15, 9, 23, 59); + cal.set(Calendar.MILLISECOND, 765); + micros = cal.getTimeInMillis() * 1000 - 1; + assertEquals(withZoneString("1848-03-15T09:23:59.764999", timezoneAmendment), stringifier.stringify(micros)); + + checkThrowingUnsupportedException(stringifier, Long.TYPE); + } + } - checkThrowingUnsupportedException(stringifier, Long.TYPE); + @Test + public void testTimestampNanosStringifier() { + for (PrimitiveStringifier stringifier : asList(TIMESTAMP_NANOS_STRINGIFIER, TIMESTAMP_NANOS_UTC_STRINGIFIER)) { + String timezoneAmendment = (stringifier == TIMESTAMP_NANOS_STRINGIFIER ? 
"" : "+0000"); + + assertEquals(withZoneString("1970-01-01T00:00:00.000000000", timezoneAmendment), stringifier.stringify(0l)); + + Calendar cal = Calendar.getInstance(UTC); + cal.clear(); + cal.set(2053, Calendar.JULY, 10, 22, 13, 24); + cal.set(Calendar.MILLISECOND, 84); + long nanos = cal.getTimeInMillis() * 1_000_000 + 536; + assertEquals(withZoneString("2053-07-10T22:13:24.084000536", timezoneAmendment), stringifier.stringify(nanos)); + + cal.clear(); + cal.set(1848, Calendar.MARCH, 15, 9, 23, 59); + cal.set(Calendar.MILLISECOND, 765); + nanos = cal.getTimeInMillis() * 1_000_000 - 1; + assertEquals(withZoneString("1848-03-15T09:23:59.764999999", timezoneAmendment), stringifier.stringify(nanos)); + + checkThrowingUnsupportedException(stringifier, Long.TYPE); + } } @Test public void testTimeStringifier() { - PrimitiveStringifier stringifier = TIME_STRINGIFIER; + for (PrimitiveStringifier stringifier : asList(TIME_STRINGIFIER, TIME_UTC_STRINGIFIER)) { + String timezoneAmendment = (stringifier == TIME_STRINGIFIER ? 
"" : "+0000"); - assertEquals("00:00:00.000", stringifier.stringify(0)); - assertEquals("00:00:00.000000", stringifier.stringify(0l)); + assertEquals(withZoneString("00:00:00.000", timezoneAmendment), stringifier.stringify(0)); + assertEquals(withZoneString("00:00:00.000000", timezoneAmendment), stringifier.stringify(0l)); - assertEquals("12:34:56.789", stringifier.stringify((int) convert(MILLISECONDS, 12, 34, 56, 789))); - assertEquals("12:34:56.789012", stringifier.stringify(convert(MICROSECONDS, 12, 34, 56, 789012))); + assertEquals(withZoneString("12:34:56.789", timezoneAmendment), stringifier.stringify((int) convert(MILLISECONDS, 12, 34, 56, 789))); + assertEquals(withZoneString("12:34:56.789012", timezoneAmendment), stringifier.stringify(convert(MICROSECONDS, 12, 34, 56, 789012))); - assertEquals("-12:34:56.789", stringifier.stringify((int) convert(MILLISECONDS, -12, -34, -56, -789))); - assertEquals("-12:34:56.789012", stringifier.stringify(convert(MICROSECONDS, -12, -34, -56, -789012))); + assertEquals(withZoneString("-12:34:56.789", timezoneAmendment), stringifier.stringify((int) convert(MILLISECONDS, -12, -34, -56, -789))); + assertEquals(withZoneString("-12:34:56.789012", timezoneAmendment), stringifier.stringify(convert(MICROSECONDS, -12, -34, -56, -789012))); - assertEquals("123:12:34.567", stringifier.stringify((int) convert(MILLISECONDS, 123, 12, 34, 567))); - assertEquals("12345:12:34.056789", stringifier.stringify(convert(MICROSECONDS, 12345, 12, 34, 56789))); + assertEquals(withZoneString("123:12:34.567", timezoneAmendment), stringifier.stringify((int) convert(MILLISECONDS, 123, 12, 34, 567))); + assertEquals(withZoneString("12345:12:34.056789", timezoneAmendment), stringifier.stringify(convert(MICROSECONDS, 12345, 12, 34, 56789))); - assertEquals("-123:12:34.567", stringifier.stringify((int) convert(MILLISECONDS, -123, -12, -34, -567))); - assertEquals("-12345:12:34.056789", stringifier.stringify(convert(MICROSECONDS, -12345, -12, -34, -56789))); 
+ assertEquals(withZoneString("-123:12:34.567", timezoneAmendment), stringifier.stringify((int) convert(MILLISECONDS, -123, -12, -34, -567))); + assertEquals(withZoneString("-12345:12:34.056789", timezoneAmendment), stringifier.stringify(convert(MICROSECONDS, -12345, -12, -34, -56789))); - checkThrowingUnsupportedException(stringifier, Integer.TYPE, Long.TYPE); + checkThrowingUnsupportedException(stringifier, Integer.TYPE, Long.TYPE); + } + } + + @Test + public void testTimeNanoStringifier() { + for (PrimitiveStringifier stringifier : asList(TIME_NANOS_STRINGIFIER, TIME_NANOS_UTC_STRINGIFIER)) { + String timezoneAmendment = (stringifier == TIME_NANOS_STRINGIFIER ? "" : "+0000"); + + assertEquals(withZoneString("00:00:00.000000000", timezoneAmendment), stringifier.stringify(0l)); + + assertEquals(withZoneString("12:34:56.789012987", timezoneAmendment), stringifier.stringify(convert(NANOSECONDS, 12, 34, 56, 789012987))); + assertEquals(withZoneString("-12:34:56.000789012", timezoneAmendment), stringifier.stringify(convert(NANOSECONDS, -12, -34, -56, -789012))); + assertEquals(withZoneString("12345:12:34.000056789", timezoneAmendment), stringifier.stringify(convert(NANOSECONDS, 12345, 12, 34, 56789))); + assertEquals(withZoneString("-12345:12:34.000056789", timezoneAmendment), stringifier.stringify(convert(NANOSECONDS, -12345, -12, -34, -56789))); + + checkThrowingUnsupportedException(stringifier, Integer.TYPE, Long.TYPE); + } + } + + private String withZoneString(String expected, String zoneString) { + return expected + zoneString; } private long convert(TimeUnit unit, long hours, long minutes, long seconds, long rest) { @@ -250,7 +310,7 @@ public void testDecimalStringifier() { } private void checkThrowingUnsupportedException(PrimitiveStringifier stringifier, Class... 
excludes) { - Set> set = new HashSet<>(Arrays.asList(excludes)); + Set> set = new HashSet<>(asList(excludes)); if (!set.contains(Integer.TYPE)) { try { stringifier.stringify(0); diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuildersWithLogicalTypes.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuildersWithLogicalTypes.java new file mode 100644 index 0000000000..fe13e604b6 --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuildersWithLogicalTypes.java @@ -0,0 +1,408 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.schema; + +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type.Repetition; +import org.junit.Assert; +import org.junit.Test; + +import java.util.concurrent.Callable; + +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MICROS; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MILLIS; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.NANOS; +import static org.apache.parquet.schema.LogicalTypeAnnotation.bsonType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.dateType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.decimalType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.intType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.jsonType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.timeType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.timestampType; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96; +import static org.apache.parquet.schema.Type.Repetition.REQUIRED; + +public class TestTypeBuildersWithLogicalTypes { + @Test + public void testGroupTypeConstruction() { + PrimitiveType f1 = Types.required(BINARY).as(stringType()).named("f1"); + 
PrimitiveType f2 = Types.required(INT32).named("f2"); + PrimitiveType f3 = Types.optional(INT32).named("f3"); + String name = "group"; + for (Repetition repetition : Repetition.values()) { + GroupType expected = new GroupType(repetition, name, + f1, + new GroupType(repetition, "g1", f2, f3)); + GroupType built = Types.buildGroup(repetition) + .addField(f1) + .group(repetition).addFields(f2, f3).named("g1") + .named(name); + Assert.assertEquals(expected, built); + + switch (repetition) { + case REQUIRED: + built = Types.requiredGroup() + .addField(f1) + .requiredGroup().addFields(f2, f3).named("g1") + .named(name); + break; + case OPTIONAL: + built = Types.optionalGroup() + .addField(f1) + .optionalGroup().addFields(f2, f3).named("g1") + .named(name); + break; + case REPEATED: + built = Types.repeatedGroup() + .addField(f1) + .repeatedGroup().addFields(f2, f3).named("g1") + .named(name); + break; + } + Assert.assertEquals(expected, built); + } + } + + @Test + public void testDecimalAnnotation() { + // int32 primitive type + MessageType expected = new MessageType("DecimalMessage", + new PrimitiveType(REQUIRED, INT32, 0, "aDecimal", + decimalType(2, 9), null)); + MessageType builderType = Types.buildMessage() + .required(INT32) + .as(decimalType(2, 9)) + .named("aDecimal") + .named("DecimalMessage"); + Assert.assertEquals(expected, builderType); + // int64 primitive type + expected = new MessageType("DecimalMessage", + new PrimitiveType(REQUIRED, INT64, 0, "aDecimal", + decimalType(2, 18), null)); + builderType = Types.buildMessage() + .required(INT64) + .as(decimalType(2, 18)).precision(18).scale(2) + .named("aDecimal") + .named("DecimalMessage"); + Assert.assertEquals(expected, builderType); + // binary primitive type + expected = new MessageType("DecimalMessage", + new PrimitiveType(REQUIRED, BINARY, 0, "aDecimal", + decimalType(2, 9), null)); + builderType = Types.buildMessage() + .required(BINARY).as(decimalType(2, 9)) + .named("aDecimal") + 
.named("DecimalMessage"); + Assert.assertEquals(expected, builderType); + // fixed primitive type + expected = new MessageType("DecimalMessage", + new PrimitiveType(REQUIRED, FIXED_LEN_BYTE_ARRAY, 4, "aDecimal", + decimalType(2, 9), null)); + builderType = Types.buildMessage() + .required(FIXED_LEN_BYTE_ARRAY).length(4) + .as(decimalType(2, 9)) + .named("aDecimal") + .named("DecimalMessage"); + Assert.assertEquals(expected, builderType); + } + + @Test + public void testDecimalAnnotationPrecisionScaleBound() { + assertThrows("Should reject scale greater than precision", + IllegalArgumentException.class, () -> Types.buildMessage() + .required(INT32).as(decimalType(4, 3)) + .named("aDecimal") + .named("DecimalMessage")); + assertThrows("Should reject scale greater than precision", + IllegalArgumentException.class, () -> Types.buildMessage() + .required(INT64).as(decimalType(4, 3)) + .named("aDecimal") + .named("DecimalMessage")); + assertThrows("Should reject scale greater than precision", + IllegalArgumentException.class, () -> Types.buildMessage() + .required(BINARY).as(decimalType(4, 3)) + .named("aDecimal") + .named("DecimalMessage")); + assertThrows("Should reject scale greater than precision", + IllegalArgumentException.class, () -> Types.buildMessage() + .required(FIXED_LEN_BYTE_ARRAY).length(7) + .as(decimalType(4, 3)) + .named("aDecimal") + .named("DecimalMessage") + ); + } + + @Test + public void testDecimalAnnotationLengthCheck() { + // maximum precision for 4 bytes is 9 + assertThrows("should reject precision 10 with length 4", + IllegalStateException.class, () -> Types.required(FIXED_LEN_BYTE_ARRAY).length(4) + .as(decimalType(2, 10)) + .named("aDecimal")); + assertThrows("should reject precision 10 with length 4", + IllegalStateException.class, () -> Types.required(INT32) + .as(decimalType(2, 10)) + .named("aDecimal")); + // maximum precision for 8 bytes is 18 + assertThrows("should reject precision 19 with length 8", + IllegalStateException.class, () -> 
Types.required(FIXED_LEN_BYTE_ARRAY).length(8) + .as(decimalType(4, 19)) + .named("aDecimal")); + assertThrows("should reject precision 19 with length 8", + IllegalStateException.class, () -> Types.required(INT64).length(8) + .as(decimalType(4, 19)) + .named("aDecimal") + ); + } + + @Test + public void testDECIMALAnnotationRejectsUnsupportedTypes() { + PrimitiveTypeName[] unsupported = new PrimitiveTypeName[]{ + BOOLEAN, INT96, DOUBLE, FLOAT + }; + for (final PrimitiveTypeName type : unsupported) { + assertThrows("Should reject non-binary type: " + type, + IllegalStateException.class, () -> Types.required(type) + .as(decimalType(2, 9)) + .named("d")); + } + } + + @Test + public void testBinaryAnnotations() { + LogicalTypeAnnotation[] types = new LogicalTypeAnnotation[] { + stringType(), jsonType(), bsonType()}; + for (final LogicalTypeAnnotation logicalType : types) { + PrimitiveType expected = new PrimitiveType(REQUIRED, BINARY, "col", logicalType); + PrimitiveType string = Types.required(BINARY).as(logicalType).named("col"); + Assert.assertEquals(expected, string); + } + } + + @Test + public void testBinaryAnnotationsRejectsNonBinary() { + LogicalTypeAnnotation[] types = new LogicalTypeAnnotation[] { + stringType(), jsonType(), bsonType()}; + for (final LogicalTypeAnnotation logicalType : types) { + PrimitiveTypeName[] nonBinary = new PrimitiveTypeName[]{ + BOOLEAN, INT32, INT64, INT96, DOUBLE, FLOAT + }; + for (final PrimitiveTypeName type : nonBinary) { + assertThrows("Should reject non-binary type: " + type, + IllegalStateException.class, () -> Types.required(type).as(logicalType).named("col")); + } + assertThrows("Should reject non-binary type: FIXED_LEN_BYTE_ARRAY", + IllegalStateException.class, () -> Types.required(FIXED_LEN_BYTE_ARRAY).length(1) + .as(logicalType).named("col")); + } + } + + @Test + public void testInt32Annotations() { + LogicalTypeAnnotation[] types = new LogicalTypeAnnotation[] { + dateType(), timeType(true, MILLIS), timeType(false, 
MILLIS), + intType(8, false), intType(16, false), intType(32, false), + intType(8, true), intType(16, true), intType(32, true)}; + for (LogicalTypeAnnotation logicalType : types) { + PrimitiveType expected = new PrimitiveType(REQUIRED, INT32, "col", logicalType); + PrimitiveType date = Types.required(INT32).as(logicalType).named("col"); + Assert.assertEquals(expected, date); + } + } + + @Test + public void testInt32AnnotationsRejectNonInt32() { + LogicalTypeAnnotation[] types = new LogicalTypeAnnotation[] { + dateType(), timeType(true, MILLIS), timeType(false, MILLIS), + intType(8, false), intType(16, false), intType(32, false), + intType(8, true), intType(16, true), intType(32, true)}; + for (final LogicalTypeAnnotation logicalType : types) { + PrimitiveTypeName[] nonInt32 = new PrimitiveTypeName[]{ + BOOLEAN, INT64, INT96, DOUBLE, FLOAT, BINARY + }; + for (final PrimitiveTypeName type : nonInt32) { + assertThrows("Should reject non-int32 type: " + type, + IllegalStateException.class, () -> Types.required(type).as(logicalType).named("col")); + } + assertThrows("Should reject non-int32 type: FIXED_LEN_BYTE_ARRAY", + IllegalStateException.class, () -> Types.required(FIXED_LEN_BYTE_ARRAY).length(1) + .as(logicalType).named("col")); + } + } + + @Test + public void testInt64Annotations() { + LogicalTypeAnnotation[] types = new LogicalTypeAnnotation[] { + timeType(true, MICROS), timeType(false, MICROS), + timeType(true, NANOS), timeType(false, NANOS), + timestampType(true, MILLIS), timestampType(false, MILLIS), + timestampType(true, MICROS), timestampType(false, MICROS), + timestampType(true, NANOS), timestampType(false, NANOS), + intType(64, true), intType(64, false)}; + for (LogicalTypeAnnotation logicalType : types) { + PrimitiveType expected = new PrimitiveType(REQUIRED, INT64, "col", logicalType); + PrimitiveType date = Types.required(INT64).as(logicalType).named("col"); + Assert.assertEquals(expected, date); + } + } + + @Test + public void 
testInt64AnnotationsRejectNonInt64() { + LogicalTypeAnnotation[] types = new LogicalTypeAnnotation[] { + timeType(true, MICROS), timeType(false, MICROS), + timeType(true, NANOS), timeType(false, NANOS), + timestampType(true, MILLIS), timestampType(false, MILLIS), + timestampType(true, MICROS), timestampType(false, MICROS), + timestampType(true, NANOS), timestampType(false, NANOS), + intType(64, true), intType(64, false)}; + for (final LogicalTypeAnnotation logicalType : types) { + PrimitiveTypeName[] nonInt64 = new PrimitiveTypeName[]{ + BOOLEAN, INT32, INT96, DOUBLE, FLOAT, BINARY + }; + for (final PrimitiveTypeName type : nonInt64) { + assertThrows("Should reject non-int64 type: " + type, + IllegalStateException.class, (Callable) () -> Types.required(type).as(logicalType).named("col")); + } + assertThrows("Should reject non-int64 type: FIXED_LEN_BYTE_ARRAY", + IllegalStateException.class, (Callable) () -> Types.required(FIXED_LEN_BYTE_ARRAY).length(1) + .as(logicalType).named("col")); + } + } + + @Test + public void testIntervalAnnotationRejectsNonFixed() { + PrimitiveTypeName[] nonFixed = new PrimitiveTypeName[]{ + BOOLEAN, INT32, INT64, INT96, DOUBLE, FLOAT, BINARY + }; + for (final PrimitiveTypeName type : nonFixed) { + assertThrows("Should reject non-fixed type: " + type, + IllegalStateException.class, () -> Types.required(type) + .as(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance()).named("interval")); + } + } + + @Test + public void testIntervalAnnotationRejectsNonFixed12() { + assertThrows("Should reject fixed with length != 12: " + 11, + IllegalStateException.class, () -> Types.required(FIXED_LEN_BYTE_ARRAY).length(11) + .as(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance()).named("interval")); + } + + @Test + public void testTypeConstructionWithUnsupportedColumnOrder() { + assertThrows(null, IllegalArgumentException.class, + () -> Types.optional(INT96).columnOrder(ColumnOrder.typeDefined()).named("int96_unsupported")); + 
assertThrows(null, IllegalArgumentException.class, + () -> Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(12) + .as(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance()) + .columnOrder(ColumnOrder.typeDefined()).named("interval_unsupported")); + } + + @Test + public void testDecimalLogicalType() { + PrimitiveType expected = new PrimitiveType(REQUIRED, BINARY, "aDecimal", + LogicalTypeAnnotation.decimalType(3, 4)); + PrimitiveType actual = Types.required(BINARY) + .as(LogicalTypeAnnotation.decimalType(3, 4)).named("aDecimal"); + Assert.assertEquals(expected, actual); + } + + @Test + public void testDecimalLogicalTypeWithDeprecatedScale() { + PrimitiveType expected = new PrimitiveType(REQUIRED, BINARY, "aDecimal", + LogicalTypeAnnotation.decimalType(3, 4)); + PrimitiveType actual = Types.required(BINARY) + .as(LogicalTypeAnnotation.decimalType(3, 4)).scale(3).named("aDecimal"); + Assert.assertEquals(expected, actual); + } + + @Test + public void testDecimalLogicalTypeWithDeprecatedPrecision() { + PrimitiveType expected = new PrimitiveType(REQUIRED, BINARY, "aDecimal", + LogicalTypeAnnotation.decimalType(3, 4)); + PrimitiveType actual = Types.required(BINARY) + .as(LogicalTypeAnnotation.decimalType(3, 4)).precision(4).named("aDecimal"); + Assert.assertEquals(expected, actual); + } + + @Test + public void testTimestampLogicalTypeWithUTCParameter() { + PrimitiveType utcMillisExpected = new PrimitiveType(REQUIRED, INT64, "aTimestamp", + timestampType(true, MILLIS)); + PrimitiveType nonUtcMillisExpected = new PrimitiveType(REQUIRED, INT64, "aTimestamp", + timestampType(false, MILLIS)); + PrimitiveType utcMicrosExpected = new PrimitiveType(REQUIRED, INT64, "aTimestamp", + timestampType(true, MICROS)); + PrimitiveType nonUtcMicrosExpected = new PrimitiveType(REQUIRED, INT64, "aTimestamp", + timestampType(false, MICROS)); + + PrimitiveType utcMillisActual = Types.required(INT64) + .as(timestampType(true, MILLIS)).named("aTimestamp"); + 
PrimitiveType nonUtcMillisActual = Types.required(INT64) + .as(timestampType(false, MILLIS)).named("aTimestamp"); + PrimitiveType utcMicrosActual = Types.required(INT64) + .as(timestampType(true, MICROS)).named("aTimestamp"); + PrimitiveType nonUtcMicrosActual = Types.required(INT64) + .as(timestampType(false, MICROS)).named("aTimestamp"); + + Assert.assertEquals(utcMillisExpected, utcMillisActual); + Assert.assertEquals(nonUtcMillisExpected, nonUtcMillisActual); + Assert.assertEquals(utcMicrosExpected, utcMicrosActual); + Assert.assertEquals(nonUtcMicrosExpected, nonUtcMicrosActual); + } + + @Test(expected = IllegalArgumentException.class) + public void testDecimalLogicalTypeWithDeprecatedScaleMismatch() { + Types.required(BINARY) + .as(LogicalTypeAnnotation.decimalType(3, 4)) + .scale(4).named("aDecimal"); + } + + @Test(expected = IllegalArgumentException.class) + public void testDecimalLogicalTypeWithDeprecatedPrecisionMismatch() { + Types.required(BINARY) + .as(LogicalTypeAnnotation.decimalType(3, 4)) + .precision(5).named("aDecimal"); + } + + /** + * A convenience method to avoid a large number of @Test(expected=...) 
tests + * @param message A String message to describe this assertion + * @param expected An Exception class that the Runnable should throw + * @param callable A Callable that is expected to throw the exception + */ + public static void assertThrows( + String message, Class expected, Callable callable) { + try { + callable.call(); + Assert.fail("No exception was thrown (" + message + "), expected: " + + expected.getName()); + } catch (Exception actual) { + Assert.assertEquals(message, expected, actual.getClass()); + } + } +} diff --git a/parquet-common/pom.xml b/parquet-common/pom.xml index e7b2446a65..1009628544 100644 --- a/parquet-common/pom.xml +++ b/parquet-common/pom.xml @@ -38,8 +38,8 @@ org.apache.parquet - parquet-format - ${parquet.format.version} + parquet-format-structures + ${project.version} @@ -61,6 +61,12 @@ ${slf4j.version} test + + + org.apache.yetus + audience-annotations + 0.7.0 + diff --git a/parquet-format-structures/pom.xml b/parquet-format-structures/pom.xml new file mode 100644 index 0000000000..e69cced3b2 --- /dev/null +++ b/parquet-format-structures/pom.xml @@ -0,0 +1,206 @@ + + + + 4.0.0 + + + org.apache.parquet + parquet + ../pom.xml + 1.10.1-SNAPSHOT + + + parquet-format-structures + jar + + Apache Parquet Format Structures + http://parquet.apache.org/ + Parquet-mr related java classes to use the parquet-format thrift structures. 
+ + + ${project.build.directory}/parquet-format-thrift + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + unpack + generate-sources + + unpack + + + + + org.apache.parquet + parquet-format + ${parquet.format.version} + jar + + + parquet.thrift + ${parquet.thrift.path} + + + + + + + org.apache.thrift.tools + maven-thrift-plugin + 0.1.11 + + ${parquet.thrift.path} + ${format.thrift.executable} + + + + thrift-sources + generate-sources + + compile + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + package + + shade + + + true + + + org.apache.thrift:libthrift + + + + + + org.apache.thrift:libthrift + + **/*.java + META-INF/LICENSE.txt + META-INF/NOTICE.txt + + + + + + org.apache.thrift + ${shade.prefix}.org.apache.thrift + + + + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + -Xdoclint:none + + + + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + -Xdoclint:none + + + + + + + + org.slf4j + slf4j-api + ${slf4j.version} + + + org.apache.thrift + libthrift + ${format.thrift.version} + + + + + + + + !windows + + + UnixClassOS + + + + org.codehaus.mojo + exec-maven-plugin + 1.2.1 + + + check-thrift-version + generate-sources + + exec + + + sh + ${basedir} + + -c + ${thrift.executable} -version | fgrep 'Thrift version ${thrift.version}' && exit 0; + echo "================================================================================="; + echo "========== [FATAL] Build is configured to require Thrift version ${thrift.version} =========="; + echo -n "========== Currently installed: "; + ${thrift.executable} -version; + echo "================================================================================="; + exit 1 + + + + + + + + + + + diff --git a/parquet-format-structures/src/main/java/org/apache/parquet/format/InterningProtocol.java b/parquet-format-structures/src/main/java/org/apache/parquet/format/InterningProtocol.java new file mode 100644 index 0000000000..a405d4f879 --- /dev/null +++ 
b/parquet-format-structures/src/main/java/org/apache/parquet/format/InterningProtocol.java @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.format; + +import java.nio.ByteBuffer; + +import org.apache.thrift.TException; +import org.apache.thrift.protocol.TField; +import org.apache.thrift.protocol.TList; +import org.apache.thrift.protocol.TMap; +import org.apache.thrift.protocol.TMessage; +import org.apache.thrift.protocol.TProtocol; +import org.apache.thrift.protocol.TSet; +import org.apache.thrift.protocol.TStruct; +import org.apache.thrift.transport.TTransport; + +/** + * TProtocol that interns the strings. 
+ */ +public class InterningProtocol extends TProtocol { + + private final TProtocol delegate; + + public InterningProtocol(TProtocol delegate) { + super(delegate.getTransport()); + this.delegate = delegate; + } + + public TTransport getTransport() { + return delegate.getTransport(); + } + + public void writeMessageBegin(TMessage message) throws TException { + delegate.writeMessageBegin(message); + } + + public void writeMessageEnd() throws TException { + delegate.writeMessageEnd(); + } + + public int hashCode() { + return delegate.hashCode(); + } + + public void writeStructBegin(TStruct struct) throws TException { + delegate.writeStructBegin(struct); + } + + public void writeStructEnd() throws TException { + delegate.writeStructEnd(); + } + + public void writeFieldBegin(TField field) throws TException { + delegate.writeFieldBegin(field); + } + + public void writeFieldEnd() throws TException { + delegate.writeFieldEnd(); + } + + public void writeFieldStop() throws TException { + delegate.writeFieldStop(); + } + + public void writeMapBegin(TMap map) throws TException { + delegate.writeMapBegin(map); + } + + public void writeMapEnd() throws TException { + delegate.writeMapEnd(); + } + + public void writeListBegin(TList list) throws TException { + delegate.writeListBegin(list); + } + + public void writeListEnd() throws TException { + delegate.writeListEnd(); + } + + public void writeSetBegin(TSet set) throws TException { + delegate.writeSetBegin(set); + } + + public void writeSetEnd() throws TException { + delegate.writeSetEnd(); + } + + public void writeBool(boolean b) throws TException { + delegate.writeBool(b); + } + + public void writeByte(byte b) throws TException { + delegate.writeByte(b); + } + + public void writeI16(short i16) throws TException { + delegate.writeI16(i16); + } + + public void writeI32(int i32) throws TException { + delegate.writeI32(i32); + } + + public void writeI64(long i64) throws TException { + delegate.writeI64(i64); + } + + public void 
writeDouble(double dub) throws TException { + delegate.writeDouble(dub); + } + + public void writeString(String str) throws TException { + delegate.writeString(str); + } + + public void writeBinary(ByteBuffer buf) throws TException { + delegate.writeBinary(buf); + } + + public TMessage readMessageBegin() throws TException { + return delegate.readMessageBegin(); + } + + public void readMessageEnd() throws TException { + delegate.readMessageEnd(); + } + + public TStruct readStructBegin() throws TException { + return delegate.readStructBegin(); + } + + public void readStructEnd() throws TException { + delegate.readStructEnd(); + } + + public TField readFieldBegin() throws TException { + return delegate.readFieldBegin(); + } + + public void readFieldEnd() throws TException { + delegate.readFieldEnd(); + } + + public TMap readMapBegin() throws TException { + return delegate.readMapBegin(); + } + + public void readMapEnd() throws TException { + delegate.readMapEnd(); + } + + public TList readListBegin() throws TException { + return delegate.readListBegin(); + } + + public void readListEnd() throws TException { + delegate.readListEnd(); + } + + public TSet readSetBegin() throws TException { + return delegate.readSetBegin(); + } + + public void readSetEnd() throws TException { + delegate.readSetEnd(); + } + + public boolean equals(Object obj) { + return delegate.equals(obj); + } + + public boolean readBool() throws TException { + return delegate.readBool(); + } + + public byte readByte() throws TException { + return delegate.readByte(); + } + + public short readI16() throws TException { + return delegate.readI16(); + } + + public int readI32() throws TException { + return delegate.readI32(); + } + + public long readI64() throws TException { + return delegate.readI64(); + } + + public double readDouble() throws TException { + return delegate.readDouble(); + } + + public String readString() throws TException { + // this is where we intern the strings + return 
delegate.readString().intern(); + } + + public ByteBuffer readBinary() throws TException { + return delegate.readBinary(); + } + + public void reset() { + delegate.reset(); + } + + public String toString() { + return delegate.toString(); + } + +} diff --git a/parquet-format-structures/src/main/java/org/apache/parquet/format/LogicalTypes.java b/parquet-format-structures/src/main/java/org/apache/parquet/format/LogicalTypes.java new file mode 100644 index 0000000000..7c63e41daf --- /dev/null +++ b/parquet-format-structures/src/main/java/org/apache/parquet/format/LogicalTypes.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.format; + +/** + * Convenience instances of logical type classes. 
+ */ +public class LogicalTypes { + public static class TimeUnits { + public static final TimeUnit MILLIS = TimeUnit.MILLIS(new MilliSeconds()); + public static final TimeUnit MICROS = TimeUnit.MICROS(new MicroSeconds()); + } + + public static LogicalType DECIMAL(int scale, int precision) { + return LogicalType.DECIMAL(new DecimalType(scale, precision)); + } + + public static final LogicalType UTF8 = LogicalType.STRING(new StringType()); + public static final LogicalType MAP = LogicalType.MAP(new MapType()); + public static final LogicalType LIST = LogicalType.LIST(new ListType()); + public static final LogicalType ENUM = LogicalType.ENUM(new EnumType()); + public static final LogicalType DATE = LogicalType.DATE(new DateType()); + public static final LogicalType TIME_MILLIS = LogicalType.TIME(new TimeType(true, TimeUnits.MILLIS)); + public static final LogicalType TIME_MICROS = LogicalType.TIME(new TimeType(true, TimeUnits.MICROS)); + public static final LogicalType TIMESTAMP_MILLIS = LogicalType.TIMESTAMP(new TimestampType(true, TimeUnits.MILLIS)); + public static final LogicalType TIMESTAMP_MICROS = LogicalType.TIMESTAMP(new TimestampType(true, TimeUnits.MICROS)); + public static final LogicalType INT_8 = LogicalType.INTEGER(new IntType((byte) 8, true)); + public static final LogicalType INT_16 = LogicalType.INTEGER(new IntType((byte) 16, true)); + public static final LogicalType INT_32 = LogicalType.INTEGER(new IntType((byte) 32, true)); + public static final LogicalType INT_64 = LogicalType.INTEGER(new IntType((byte) 64, true)); + public static final LogicalType UINT_8 = LogicalType.INTEGER(new IntType((byte) 8, false)); + public static final LogicalType UINT_16 = LogicalType.INTEGER(new IntType((byte) 16, false)); + public static final LogicalType UINT_32 = LogicalType.INTEGER(new IntType((byte) 32, false)); + public static final LogicalType UINT_64 = LogicalType.INTEGER(new IntType((byte) 64, false)); + public static final LogicalType UNKNOWN = 
LogicalType.UNKNOWN(new NullType()); + public static final LogicalType JSON = LogicalType.JSON(new JsonType()); + public static final LogicalType BSON = LogicalType.BSON(new BsonType()); +} diff --git a/parquet-format-structures/src/main/java/org/apache/parquet/format/Util.java b/parquet-format-structures/src/main/java/org/apache/parquet/format/Util.java new file mode 100644 index 0000000000..d09d007a20 --- /dev/null +++ b/parquet-format-structures/src/main/java/org/apache/parquet/format/Util.java @@ -0,0 +1,236 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.parquet.format; + +import static org.apache.parquet.format.FileMetaData._Fields.CREATED_BY; +import static org.apache.parquet.format.FileMetaData._Fields.KEY_VALUE_METADATA; +import static org.apache.parquet.format.FileMetaData._Fields.NUM_ROWS; +import static org.apache.parquet.format.FileMetaData._Fields.ROW_GROUPS; +import static org.apache.parquet.format.FileMetaData._Fields.SCHEMA; +import static org.apache.parquet.format.FileMetaData._Fields.VERSION; +import static org.apache.parquet.format.event.Consumers.fieldConsumer; +import static org.apache.parquet.format.event.Consumers.listElementsOf; +import static org.apache.parquet.format.event.Consumers.listOf; +import static org.apache.parquet.format.event.Consumers.struct; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.List; + +import org.apache.thrift.TBase; +import org.apache.thrift.TException; +import org.apache.thrift.protocol.TCompactProtocol; +import org.apache.thrift.protocol.TProtocol; +import org.apache.thrift.transport.TIOStreamTransport; + +import org.apache.parquet.format.event.Consumers.Consumer; +import org.apache.parquet.format.event.Consumers.DelegatingFieldConsumer; +import org.apache.parquet.format.event.EventBasedThriftReader; +import org.apache.parquet.format.event.TypedConsumer.I32Consumer; +import org.apache.parquet.format.event.TypedConsumer.I64Consumer; +import org.apache.parquet.format.event.TypedConsumer.StringConsumer; + +/** + * Utility to read/write metadata + * We use the TCompactProtocol to serialize metadata + */ +public class Util { + + public static void writeColumnIndex(ColumnIndex columnIndex, OutputStream to) throws IOException { + write(columnIndex, to); + } + + public static ColumnIndex readColumnIndex(InputStream from) throws IOException { + return read(from, new ColumnIndex()); + } + + public static void writeOffsetIndex(OffsetIndex offsetIndex, OutputStream to) throws IOException { + 
write(offsetIndex, to); + } + + public static OffsetIndex readOffsetIndex(InputStream from) throws IOException { + return read(from, new OffsetIndex()); + } + + public static void writePageHeader(PageHeader pageHeader, OutputStream to) throws IOException { + write(pageHeader, to); + } + + public static PageHeader readPageHeader(InputStream from) throws IOException { + return read(from, new PageHeader()); + } + + public static void writeFileMetaData(org.apache.parquet.format.FileMetaData fileMetadata, OutputStream to) throws IOException { + write(fileMetadata, to); + } + + public static FileMetaData readFileMetaData(InputStream from) throws IOException { + return read(from, new FileMetaData()); + } + /** + * reads the meta data from the stream + * @param from the stream to read the metadata from + * @param skipRowGroups whether row groups should be skipped + * @return the resulting metadata + * @throws IOException if any I/O error occurs during the reading + */ + public static FileMetaData readFileMetaData(InputStream from, boolean skipRowGroups) throws IOException { + FileMetaData md = new FileMetaData(); + if (skipRowGroups) { + readFileMetaData(from, new DefaultFileMetaDataConsumer(md), skipRowGroups); + } else { + read(from, md); + } + return md; + } + + /** + * To read metadata in a streaming fashion. 
+ * + */ + public static abstract class FileMetaDataConsumer { + abstract public void setVersion(int version); + abstract public void setSchema(List schema); + abstract public void setNumRows(long numRows); + abstract public void addRowGroup(RowGroup rowGroup); + abstract public void addKeyValueMetaData(KeyValue kv); + abstract public void setCreatedBy(String createdBy); + } + + /** + * Simple default consumer that sets the fields + * + */ + public static final class DefaultFileMetaDataConsumer extends FileMetaDataConsumer { + private final FileMetaData md; + + public DefaultFileMetaDataConsumer(FileMetaData md) { + this.md = md; + } + + @Override + public void setVersion(int version) { + md.setVersion(version); + } + + @Override + public void setSchema(List schema) { + md.setSchema(schema); + } + + @Override + public void setNumRows(long numRows) { + md.setNum_rows(numRows); + } + + @Override + public void setCreatedBy(String createdBy) { + md.setCreated_by(createdBy); + } + + @Override + public void addRowGroup(RowGroup rowGroup) { + md.addToRow_groups(rowGroup); + } + + @Override + public void addKeyValueMetaData(KeyValue kv) { + md.addToKey_value_metadata(kv); + } + } + + public static void readFileMetaData(InputStream from, FileMetaDataConsumer consumer) throws IOException { + readFileMetaData(from, consumer, false); + } + + public static void readFileMetaData(InputStream from, final FileMetaDataConsumer consumer, boolean skipRowGroups) throws IOException { + try { + DelegatingFieldConsumer eventConsumer = fieldConsumer() + .onField(VERSION, new I32Consumer() { + @Override + public void consume(int value) { + consumer.setVersion(value); + } + }).onField(SCHEMA, listOf(SchemaElement.class, new Consumer>() { + @Override + public void consume(List schema) { + consumer.setSchema(schema); + } + })).onField(NUM_ROWS, new I64Consumer() { + @Override + public void consume(long value) { + consumer.setNumRows(value); + } + }).onField(KEY_VALUE_METADATA, 
listElementsOf(struct(KeyValue.class, new Consumer() { + @Override + public void consume(KeyValue kv) { + consumer.addKeyValueMetaData(kv); + } + }))).onField(CREATED_BY, new StringConsumer() { + @Override + public void consume(String value) { + consumer.setCreatedBy(value); + } + }); + if (!skipRowGroups) { + eventConsumer = eventConsumer.onField(ROW_GROUPS, listElementsOf(struct(RowGroup.class, new Consumer() { + @Override + public void consume(RowGroup rowGroup) { + consumer.addRowGroup(rowGroup); + } + }))); + } + new EventBasedThriftReader(protocol(from)).readStruct(eventConsumer); + + } catch (TException e) { + throw new IOException("can not read FileMetaData: " + e.getMessage(), e); + } + } + + private static TProtocol protocol(OutputStream to) { + return protocol(new TIOStreamTransport(to)); + } + + private static TProtocol protocol(InputStream from) { + return protocol(new TIOStreamTransport(from)); + } + + private static InterningProtocol protocol(TIOStreamTransport t) { + return new InterningProtocol(new TCompactProtocol(t)); + } + + private static > T read(InputStream from, T tbase) throws IOException { + try { + tbase.read(protocol(from)); + return tbase; + } catch (TException e) { + throw new IOException("can not read " + tbase.getClass() + ": " + e.getMessage(), e); + } + } + + private static void write(TBase tbase, OutputStream to) throws IOException { + try { + tbase.write(protocol(to)); + } catch (TException e) { + throw new IOException("can not write " + tbase, e); + } + } +} diff --git a/parquet-format-structures/src/main/java/org/apache/parquet/format/event/Consumers.java b/parquet-format-structures/src/main/java/org/apache/parquet/format/event/Consumers.java new file mode 100644 index 0000000000..ef87997e7a --- /dev/null +++ b/parquet-format-structures/src/main/java/org/apache/parquet/format/event/Consumers.java @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.format.event; + +import static java.util.Collections.unmodifiableMap; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.thrift.TBase; +import org.apache.thrift.TException; +import org.apache.thrift.TFieldIdEnum; +import org.apache.thrift.protocol.TList; +import org.apache.thrift.protocol.TProtocol; +import org.apache.thrift.protocol.TProtocolUtil; + +import org.apache.parquet.format.event.Consumers.Consumer; +import org.apache.parquet.format.event.TypedConsumer.ListConsumer; +import org.apache.parquet.format.event.TypedConsumer.StructConsumer; + +/** + * Entry point for reading thrift in a streaming fashion + */ +public class Consumers { + + /** + * To consume objects coming from a DelegatingFieldConsumer + * + * @param the type of consumed objects + */ + public static interface Consumer { + void consume(T t); + } + + /** + * Delegates reading the field to TypedConsumers. + * There is one TypedConsumer per thrift type. + * use {@link #onField(TFieldIdEnum, TypedConsumer)} et al. to consume specific thrift fields. 
+ * @see Consumers#fieldConsumer() + */ + public static class DelegatingFieldConsumer implements FieldConsumer { + + private final Map contexts; + private final FieldConsumer defaultFieldEventConsumer; + + private DelegatingFieldConsumer(FieldConsumer defaultFieldEventConsumer, Map contexts) { + this.defaultFieldEventConsumer = defaultFieldEventConsumer; + this.contexts = unmodifiableMap(contexts); + } + + private DelegatingFieldConsumer() { + this(new SkippingFieldConsumer()); + } + + private DelegatingFieldConsumer(FieldConsumer defaultFieldEventConsumer) { + this(defaultFieldEventConsumer, Collections.emptyMap()); + } + + public DelegatingFieldConsumer onField(TFieldIdEnum e, TypedConsumer typedConsumer) { + Map newContexts = new HashMap(contexts); + newContexts.put(e.getThriftFieldId(), typedConsumer); + return new DelegatingFieldConsumer(defaultFieldEventConsumer, newContexts); + } + + @Override + public void consumeField( + TProtocol protocol, EventBasedThriftReader reader, + short id, byte type) throws TException { + TypedConsumer delegate = contexts.get(id); + if (delegate != null) { + delegate.read(protocol, reader, type); + } else { + defaultFieldEventConsumer.consumeField(protocol, reader, id, type); + } + } + } + + /** + * call onField on the resulting DelegatingFieldConsumer to handle individual fields + * @return a new DelegatingFieldConsumer + */ + public static DelegatingFieldConsumer fieldConsumer() { + return new DelegatingFieldConsumer(); + } + + /** + * To consume a list of elements + * @param c the class of the list content + * @param consumer the consumer that will receive the list + * @param the type of the list content + * @return a ListConsumer that can be passed to the DelegatingFieldConsumer + */ + public static > ListConsumer listOf(Class c, final Consumer> consumer) { + class ListConsumer implements Consumer { + List list; + @Override + public void consume(T t) { + list.add(t); + } + } + final ListConsumer co = new ListConsumer(); + 
return new DelegatingListElementsConsumer(struct(c, co)) { + @Override + public void consumeList(TProtocol protocol, + EventBasedThriftReader reader, TList tList) throws TException { + co.list = new ArrayList(); + super.consumeList(protocol, reader, tList); + consumer.consume(co.list); + } + }; + } + + /** + * To consume list elements one by one + * @param consumer the consumer that will read the elements + * @return a ListConsumer that can be passed to the DelegatingFieldConsumer + */ + public static ListConsumer listElementsOf(TypedConsumer consumer) { + return new DelegatingListElementsConsumer(consumer); + } + + public static > StructConsumer struct(final Class c, final Consumer consumer) { + return new TBaseStructConsumer(c, consumer); + } +} + +class SkippingFieldConsumer implements FieldConsumer { + @Override + public void consumeField(TProtocol protocol, EventBasedThriftReader reader, short id, byte type) throws TException { + TProtocolUtil.skip(protocol, type); + } +} + +class DelegatingListElementsConsumer extends ListConsumer { + + private TypedConsumer elementConsumer; + + protected DelegatingListElementsConsumer(TypedConsumer consumer) { + this.elementConsumer = consumer; + } + + @Override + public void consumeElement(TProtocol protocol, EventBasedThriftReader reader, byte elemType) throws TException { + elementConsumer.read(protocol, reader, elemType); + } +} +class TBaseStructConsumer> extends StructConsumer { + + private final Class c; + private Consumer consumer; + + public TBaseStructConsumer(Class c, Consumer consumer) { + this.c = c; + this.consumer = consumer; + } + + @Override + public void consumeStruct(TProtocol protocol, EventBasedThriftReader reader) throws TException { + T o = newObject(); + o.read(protocol); + consumer.consume(o); + } + + protected T newObject() { + try { + return c.newInstance(); + } catch (InstantiationException e) { + throw new RuntimeException(c.getName(), e); + } catch (IllegalAccessException e) { + throw new 
RuntimeException(c.getName(), e); + } + } + +} \ No newline at end of file diff --git a/parquet-format-structures/src/main/java/org/apache/parquet/format/event/EventBasedThriftReader.java b/parquet-format-structures/src/main/java/org/apache/parquet/format/event/EventBasedThriftReader.java new file mode 100644 index 0000000000..2fb9cf651f --- /dev/null +++ b/parquet-format-structures/src/main/java/org/apache/parquet/format/event/EventBasedThriftReader.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.format.event; + +import org.apache.thrift.TException; +import org.apache.thrift.protocol.TField; +import org.apache.thrift.protocol.TList; +import org.apache.thrift.protocol.TMap; +import org.apache.thrift.protocol.TProtocol; +import org.apache.thrift.protocol.TSet; +import org.apache.thrift.protocol.TType; + +import org.apache.parquet.format.event.TypedConsumer.ListConsumer; +import org.apache.parquet.format.event.TypedConsumer.MapConsumer; +import org.apache.parquet.format.event.TypedConsumer.SetConsumer; + +/** + * Event based reader for Thrift + */ +public final class EventBasedThriftReader { + + private final TProtocol protocol; + + /** + * @param protocol the protocol to read from + */ + public EventBasedThriftReader(TProtocol protocol) { + this.protocol = protocol; + } + + /** + * reads a Struct from the underlying protocol and passes the field events to the FieldConsumer + * @param c the field consumer + * @throws TException if any thrift related error occurs during the reading + */ + public void readStruct(FieldConsumer c) throws TException { + protocol.readStructBegin(); + readStructContent(c); + protocol.readStructEnd(); + } + + /** + * reads the content of a struct (fields) from the underlying protocol and passes the events to c + * @param c the field consumer + * @throws TException if any thrift related error occurs during the reading + */ + public void readStructContent(FieldConsumer c) throws TException { + TField field; + while (true) { + field = protocol.readFieldBegin(); + if (field.type == TType.STOP) { + break; + } + c.consumeField(protocol, this, field.id, field.type); + } + } + + /** + * reads the set content (elements) from the underlying protocol and passes the events to the set event consumer + * @param eventConsumer the consumer + * @param tSet the set descriptor + * @throws TException if any thrift related error occurs during the reading + */ + public void readSetContent(SetConsumer eventConsumer, TSet 
tSet) + throws TException { + for (int i = 0; i < tSet.size; i++) { + eventConsumer.consumeElement(protocol, this, tSet.elemType); + } + } + + /** + * reads the map content (key values) from the underlying protocol and passes the events to the map event consumer + * @param eventConsumer the consumer + * @param tMap the map descriptor + * @throws TException if any thrift related error occurs during the reading + */ + public void readMapContent(MapConsumer eventConsumer, TMap tMap) + throws TException { + for (int i = 0; i < tMap.size; i++) { + eventConsumer.consumeEntry(protocol, this, tMap.keyType, tMap.valueType); + } + } + + /** + * reads a key-value pair + * @param keyType the type of the key + * @param keyConsumer the consumer for the key + * @param valueType the type of the value + * @param valueConsumer the consumer for the value + * @throws TException if any thrift related error occurs during the reading + */ + public void readMapEntry(byte keyType, TypedConsumer keyConsumer, byte valueType, TypedConsumer valueConsumer) + throws TException { + keyConsumer.read(protocol, this, keyType); + valueConsumer.read(protocol, this, valueType); + } + + /** + * reads the list content (elements) from the underlying protocol and passes the events to the list event consumer + * @param eventConsumer the consumer + * @param tList the list descriptor + * @throws TException if any thrift related error occurs during the reading + */ + public void readListContent(ListConsumer eventConsumer, TList tList) + throws TException { + for (int i = 0; i < tList.size; i++) { + eventConsumer.consumeElement(protocol, this, tList.elemType); + } + } +} \ No newline at end of file diff --git a/parquet-format-structures/src/main/java/org/apache/parquet/format/event/FieldConsumer.java b/parquet-format-structures/src/main/java/org/apache/parquet/format/event/FieldConsumer.java new file mode 100644 index 0000000000..6656934b6c --- /dev/null +++ 
b/parquet-format-structures/src/main/java/org/apache/parquet/format/event/FieldConsumer.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.format.event; + +import org.apache.thrift.TException; +import org.apache.thrift.protocol.TProtocol; + +/** + * To receive Thrift field events + */ +public interface FieldConsumer { + + /** + * called by the EventBasedThriftReader when reading a field from a Struct + * @param protocol the underlying protocol + * @param eventBasedThriftReader the reader to delegate to further calls. 
+ * @param id the id of the field + * @param type the type of the field + * @throws TException if any thrift related error occurs during the reading + */ + public void consumeField(TProtocol protocol, EventBasedThriftReader eventBasedThriftReader, short id, byte type) throws TException; + +} \ No newline at end of file diff --git a/parquet-format-structures/src/main/java/org/apache/parquet/format/event/TypedConsumer.java b/parquet-format-structures/src/main/java/org/apache/parquet/format/event/TypedConsumer.java new file mode 100644 index 0000000000..734449f5ed --- /dev/null +++ b/parquet-format-structures/src/main/java/org/apache/parquet/format/event/TypedConsumer.java @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.format.event; + +import static org.apache.thrift.protocol.TType.BOOL; +import static org.apache.thrift.protocol.TType.BYTE; +import static org.apache.thrift.protocol.TType.DOUBLE; +import static org.apache.thrift.protocol.TType.I16; +import static org.apache.thrift.protocol.TType.I32; +import static org.apache.thrift.protocol.TType.I64; +import static org.apache.thrift.protocol.TType.LIST; +import static org.apache.thrift.protocol.TType.MAP; +import static org.apache.thrift.protocol.TType.SET; +import static org.apache.thrift.protocol.TType.STRING; +import static org.apache.thrift.protocol.TType.STRUCT; + +import org.apache.thrift.TException; +import org.apache.thrift.protocol.TList; +import org.apache.thrift.protocol.TMap; +import org.apache.thrift.protocol.TProtocol; +import org.apache.thrift.protocol.TSet; + +/** + * receive thrift events of a given type + */ +abstract public class TypedConsumer { + + abstract public static class DoubleConsumer extends TypedConsumer { + protected DoubleConsumer() { super(DOUBLE); } + @Override + final void read(TProtocol protocol, EventBasedThriftReader reader) throws TException { + this.consume(protocol.readDouble()); + } + abstract public void consume(double value); + } + + abstract public static class ByteConsumer extends TypedConsumer { + protected ByteConsumer() { super(BYTE); } + @Override + final void read(TProtocol protocol, EventBasedThriftReader reader) throws TException { + this.consume(protocol.readByte()); + } + abstract public void consume(byte value); + } + + abstract public static class BoolConsumer extends TypedConsumer { + protected BoolConsumer() { super(BOOL); } + @Override + final void read(TProtocol protocol, EventBasedThriftReader reader) throws TException { + this.consume(protocol.readBool()); + } + abstract public void consume(boolean value); + } + + abstract public static class I32Consumer extends TypedConsumer { + protected I32Consumer() { super(I32); } + @Override + 
final void read(TProtocol protocol, EventBasedThriftReader reader) throws TException { + this.consume(protocol.readI32()); + } + abstract public void consume(int value); + } + + abstract public static class I64Consumer extends TypedConsumer { + protected I64Consumer() { super(I64); } + final void read(TProtocol protocol, EventBasedThriftReader reader) throws TException { + this.consume(protocol.readI64()); + } + abstract public void consume(long value); + } + + abstract public static class I16Consumer extends TypedConsumer { + protected I16Consumer() { super(I16); } + @Override + final void read(TProtocol protocol, EventBasedThriftReader reader) throws TException { + this.consume(protocol.readI16()); + } + abstract public void consume(short value); + } + + abstract public static class StringConsumer extends TypedConsumer { + protected StringConsumer() { super(STRING); } + @Override + final void read(TProtocol protocol, EventBasedThriftReader reader) throws TException { + this.consume(protocol.readString()); + } + abstract public void consume(String value); + } + + abstract public static class StructConsumer extends TypedConsumer { + protected StructConsumer() { super(STRUCT); } + @Override + final void read(TProtocol protocol, EventBasedThriftReader reader) throws TException { + this.consumeStruct(protocol, reader); + } + /** + * can either delegate to the reader or read the struct from the protocol + * reader.readStruct(fieldConsumer); + * @param protocol the underlying protocol + * @param reader the reader to delegate to + * @throws TException if any thrift related error occurs during the reading + */ + abstract public void consumeStruct(TProtocol protocol, EventBasedThriftReader reader) throws TException; + } + + abstract public static class ListConsumer extends TypedConsumer { + protected ListConsumer() { super(LIST); } + @Override + final void read(TProtocol protocol, EventBasedThriftReader reader) throws TException { + this.consumeList(protocol, reader, 
protocol.readListBegin()); + protocol.readListEnd(); + } + public void consumeList(TProtocol protocol, EventBasedThriftReader reader, TList tList) throws TException { + reader.readListContent(this, tList); + } + /** + * can either delegate to the reader or read the element from the protocol + * @param protocol the underlying protocol + * @param reader the reader to delegate to + * @param elemType the type of the element + * @throws TException if any thrift related error occurs during the reading + */ + abstract public void consumeElement(TProtocol protocol, EventBasedThriftReader reader, byte elemType) throws TException; + } + + abstract public static class SetConsumer extends TypedConsumer { + protected SetConsumer() { super(SET); } + @Override + final void read(TProtocol protocol, EventBasedThriftReader reader) throws TException { + this.consumeSet(protocol, reader, protocol.readSetBegin()); + protocol.readSetEnd(); + } + public void consumeSet(TProtocol protocol, EventBasedThriftReader reader, TSet tSet) throws TException { + reader.readSetContent(this, tSet); + } + /** + * can either delegate to the reader or read the set from the protocol + * @param protocol the underlying protocol + * @param reader the reader to delegate to + * @param elemType the type of the element + * @throws TException if any thrift related error occurs during the reading + */ + abstract public void consumeElement( + TProtocol protocol, EventBasedThriftReader reader, + byte elemType) throws TException; + } + + abstract public static class MapConsumer extends TypedConsumer { + protected MapConsumer() { super(MAP); } + @Override + final void read(TProtocol protocol, EventBasedThriftReader reader) + throws TException { + this.consumeMap(protocol, reader , protocol.readMapBegin()); + protocol.readMapEnd(); + } + public void consumeMap(TProtocol protocol, EventBasedThriftReader reader, TMap tMap) throws TException { + reader.readMapContent(this, tMap); + } + /** + * can either delegate to the 
reader or read the map entry from the protocol + * @param protocol the underlying protocol + * @param reader the reader to delegate to + * @param keyType the type of the key + * @param valueType the type of the value + * @throws TException if any thrift related error occurs during the reading + */ + abstract public void consumeEntry( + TProtocol protocol, EventBasedThriftReader reader, + byte keyType, byte valueType) throws TException; + } + + public final byte type; + + private TypedConsumer(byte type) { + this.type = type; + } + + final public void read(TProtocol protocol, EventBasedThriftReader reader, byte type) throws TException { + if (this.type != type) { + throw new TException( + "Incorrect type in stream. " + + "Expected " + this.type + + " but got " + type); + } + this.read(protocol, reader); + } + + abstract void read(TProtocol protocol, EventBasedThriftReader reader) throws TException; +} \ No newline at end of file diff --git a/parquet-format-structures/src/test/java/org/apache/parquet/format/TestUtil.java b/parquet-format-structures/src/test/java/org/apache/parquet/format/TestUtil.java new file mode 100644 index 0000000000..1adf0998fb --- /dev/null +++ b/parquet-format-structures/src/test/java/org/apache/parquet/format/TestUtil.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.format; + +import static java.util.Arrays.asList; +import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.assertNull; +import static org.apache.parquet.format.Util.readFileMetaData; +import static org.apache.parquet.format.Util.writeFileMetaData; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; + +import org.junit.Test; + +import org.apache.parquet.format.Util.DefaultFileMetaDataConsumer; +public class TestUtil { + + @Test + public void testReadFileMetadata() throws Exception { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + FileMetaData md = new FileMetaData( + 1, + asList(new SchemaElement("foo")), + 10, + asList( + new RowGroup( + asList( + new ColumnChunk(0), + new ColumnChunk(1) + ), + 10, + 5), + new RowGroup( + asList( + new ColumnChunk(2), + new ColumnChunk(3) + ), + 11, + 5) + ) + ); + writeFileMetaData(md , baos); + FileMetaData md2 = readFileMetaData(in(baos)); + FileMetaData md3 = new FileMetaData(); + readFileMetaData(in(baos), new DefaultFileMetaDataConsumer(md3)); + FileMetaData md4 = new FileMetaData(); + readFileMetaData(in(baos), new DefaultFileMetaDataConsumer(md4), true); + FileMetaData md5 = readFileMetaData(in(baos), true); + FileMetaData md6 = readFileMetaData(in(baos), false); + assertEquals(md, md2); + assertEquals(md, md3); + assertNull(md4.getRow_groups()); + assertNull(md5.getRow_groups()); + assertEquals(md4, md5); + md4.setRow_groups(md.getRow_groups()); + md5.setRow_groups(md.getRow_groups()); + assertEquals(md, md4); + assertEquals(md, md5); + assertEquals(md4, md5); + assertEquals(md, md6); + } + + private ByteArrayInputStream in(ByteArrayOutputStream baos) { + return new ByteArrayInputStream(baos.toByteArray()); + } +} diff --git a/parquet-hadoop/pom.xml b/parquet-hadoop/pom.xml index 98972a2357..8d31f7dd03 100644 --- 
a/parquet-hadoop/pom.xml +++ b/parquet-hadoop/pom.xml @@ -43,8 +43,8 @@ org.apache.parquet - parquet-format - ${parquet.format.version} + parquet-format-structures + ${project.version} org.apache.hadoop diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/HadoopReadOptions.java b/parquet-hadoop/src/main/java/org/apache/parquet/HadoopReadOptions.java index b8f481e8a7..4f5c78adb2 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/HadoopReadOptions.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/HadoopReadOptions.java @@ -28,6 +28,7 @@ import java.util.Map; +import static org.apache.parquet.hadoop.ParquetInputFormat.COLUMN_INDEX_FILTERING_ENABLED; import static org.apache.parquet.hadoop.ParquetInputFormat.DICTIONARY_FILTERING_ENABLED; import static org.apache.parquet.hadoop.ParquetInputFormat.RECORD_FILTERING_ENABLED; import static org.apache.parquet.hadoop.ParquetInputFormat.STATS_FILTERING_ENABLED; @@ -43,6 +44,7 @@ private HadoopReadOptions(boolean useSignedStringMinMax, boolean useStatsFilter, boolean useDictionaryFilter, boolean useRecordFilter, + boolean useColumnIndexFilter, FilterCompat.Filter recordFilter, MetadataFilter metadataFilter, CompressionCodecFactory codecFactory, @@ -51,8 +53,8 @@ private HadoopReadOptions(boolean useSignedStringMinMax, Map properties, Configuration conf) { super( - useSignedStringMinMax, useStatsFilter, useDictionaryFilter, useRecordFilter, recordFilter, - metadataFilter, codecFactory, allocator, maxAllocationSize, properties + useSignedStringMinMax, useStatsFilter, useDictionaryFilter, useRecordFilter, useColumnIndexFilter, + recordFilter, metadataFilter, codecFactory, allocator, maxAllocationSize, properties ); this.conf = conf; } @@ -83,6 +85,7 @@ public Builder(Configuration conf) { useDictionaryFilter(conf.getBoolean(DICTIONARY_FILTERING_ENABLED, true)); useStatsFilter(conf.getBoolean(STATS_FILTERING_ENABLED, true)); useRecordFilter(conf.getBoolean(RECORD_FILTERING_ENABLED, true)); + 
useColumnIndexFilter(conf.getBoolean(COLUMN_INDEX_FILTERING_ENABLED, true)); withCodecFactory(HadoopCodecs.newFactory(conf, 0)); withRecordFilter(getFilter(conf)); withMaxAllocationInBytes(conf.getInt(ALLOCATION_SIZE, 8388608)); @@ -95,7 +98,7 @@ public Builder(Configuration conf) { @Override public ParquetReadOptions build() { return new HadoopReadOptions( - useSignedStringMinMax, useStatsFilter, useDictionaryFilter, useRecordFilter, + useSignedStringMinMax, useStatsFilter, useDictionaryFilter, useRecordFilter, useColumnIndexFilter, recordFilter, metadataFilter, codecFactory, allocator, maxAllocationSize, properties, conf); } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/ParquetReadOptions.java b/parquet-hadoop/src/main/java/org/apache/parquet/ParquetReadOptions.java index 4ef24601c9..846d3bd809 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/ParquetReadOptions.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/ParquetReadOptions.java @@ -38,12 +38,14 @@ public class ParquetReadOptions { private static final boolean RECORD_FILTERING_ENABLED_DEFAULT = true; private static final boolean STATS_FILTERING_ENABLED_DEFAULT = true; private static final boolean DICTIONARY_FILTERING_ENABLED_DEFAULT = true; + private static final boolean COLUMN_INDEX_FILTERING_ENABLED_DEFAULT = true; private static final int ALLOCATION_SIZE_DEFAULT = 8388608; // 8MB private final boolean useSignedStringMinMax; private final boolean useStatsFilter; private final boolean useDictionaryFilter; private final boolean useRecordFilter; + private final boolean useColumnIndexFilter; private final FilterCompat.Filter recordFilter; private final ParquetMetadataConverter.MetadataFilter metadataFilter; private final CompressionCodecFactory codecFactory; @@ -55,6 +57,7 @@ public class ParquetReadOptions { boolean useStatsFilter, boolean useDictionaryFilter, boolean useRecordFilter, + boolean useColumnIndexFilter, FilterCompat.Filter recordFilter, 
ParquetMetadataConverter.MetadataFilter metadataFilter, CompressionCodecFactory codecFactory, @@ -65,6 +68,7 @@ public class ParquetReadOptions { this.useStatsFilter = useStatsFilter; this.useDictionaryFilter = useDictionaryFilter; this.useRecordFilter = useRecordFilter; + this.useColumnIndexFilter = useColumnIndexFilter; this.recordFilter = recordFilter; this.metadataFilter = metadataFilter; this.codecFactory = codecFactory; @@ -89,6 +93,10 @@ public boolean useRecordFilter() { return useRecordFilter; } + public boolean useColumnIndexFilter() { + return useColumnIndexFilter; + } + public FilterCompat.Filter getRecordFilter() { return recordFilter; } @@ -134,6 +142,7 @@ public static class Builder { protected boolean useStatsFilter = STATS_FILTERING_ENABLED_DEFAULT; protected boolean useDictionaryFilter = DICTIONARY_FILTERING_ENABLED_DEFAULT; protected boolean useRecordFilter = RECORD_FILTERING_ENABLED_DEFAULT; + protected boolean useColumnIndexFilter = COLUMN_INDEX_FILTERING_ENABLED_DEFAULT; protected FilterCompat.Filter recordFilter = null; protected ParquetMetadataConverter.MetadataFilter metadataFilter = NO_FILTER; // the page size parameter isn't used when only using the codec factory to get decompressors @@ -182,6 +191,15 @@ public Builder useRecordFilter() { return this; } + public Builder useColumnIndexFilter(boolean useColumnIndexFilter) { + this.useColumnIndexFilter = useColumnIndexFilter; + return this; + } + + public Builder useColumnIndexFilter() { + return useColumnIndexFilter(true); + } + public Builder withRecordFilter(FilterCompat.Filter rowGroupFilter) { this.recordFilter = rowGroupFilter; return this; @@ -239,7 +257,7 @@ public Builder copy(ParquetReadOptions options) { public ParquetReadOptions build() { return new ParquetReadOptions( - useSignedStringMinMax, useStatsFilter, useDictionaryFilter, useRecordFilter, + useSignedStringMinMax, useStatsFilter, useDictionaryFilter, useRecordFilter, useColumnIndexFilter, recordFilter, metadataFilter, 
codecFactory, allocator, maxAllocationSize, properties); } } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 7f2a766a47..468ae0277f 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -18,6 +18,10 @@ */ package org.apache.parquet.format.converter; +import static java.util.Optional.empty; + +import static java.util.Optional.empty; +import static java.util.Optional.of; import static org.apache.parquet.format.Util.readFileMetaData; import static org.apache.parquet.format.Util.writePageHeader; @@ -52,6 +56,7 @@ import org.apache.parquet.format.MapType; import org.apache.parquet.format.MicroSeconds; import org.apache.parquet.format.MilliSeconds; +import org.apache.parquet.format.NanoSeconds; import org.apache.parquet.format.NullType; import org.apache.parquet.format.PageEncodingStats; import org.apache.parquet.format.StringType; @@ -59,7 +64,9 @@ import org.apache.parquet.format.TimeUnit; import org.apache.parquet.format.TimestampType; import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.format.BoundaryOrder; import org.apache.parquet.format.ColumnChunk; +import org.apache.parquet.format.ColumnIndex; import org.apache.parquet.format.ColumnMetaData; import org.apache.parquet.format.ColumnOrder; import org.apache.parquet.format.ConvertedType; @@ -70,7 +77,9 @@ import org.apache.parquet.format.FieldRepetitionType; import org.apache.parquet.format.FileMetaData; import org.apache.parquet.format.KeyValue; +import org.apache.parquet.format.OffsetIndex; import org.apache.parquet.format.PageHeader; +import org.apache.parquet.format.PageLocation; import org.apache.parquet.format.PageType; import org.apache.parquet.format.RowGroup; 
import org.apache.parquet.format.SchemaElement; @@ -82,6 +91,9 @@ import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.column.EncodingStats; import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder; +import org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder; +import org.apache.parquet.internal.hadoop.metadata.IndexReference; import org.apache.parquet.io.ParquetDecodingException; import org.apache.parquet.schema.ColumnOrder.ColumnOrderName; import org.apache.parquet.schema.GroupType; @@ -248,7 +260,7 @@ LogicalType convertToLogicalType(LogicalTypeAnnotation logicalTypeAnnotation) { } ConvertedType convertToConvertedType(LogicalTypeAnnotation logicalTypeAnnotation) { - return logicalTypeAnnotation.accept(CONVERTED_TYPE_CONVERTER_VISITOR).get(); + return logicalTypeAnnotation.accept(CONVERTED_TYPE_CONVERTER_VISITOR).orElse(null); } static org.apache.parquet.format.TimeUnit convertUnit(LogicalTypeAnnotation.TimeUnit unit) { @@ -257,6 +269,8 @@ static org.apache.parquet.format.TimeUnit convertUnit(LogicalTypeAnnotation.Time return org.apache.parquet.format.TimeUnit.MICROS(new MicroSeconds()); case MILLIS: return org.apache.parquet.format.TimeUnit.MILLIS(new MilliSeconds()); + case NANOS: + return TimeUnit.NANOS(new NanoSeconds()); default: throw new RuntimeException("Unknown time unit " + unit); } @@ -264,161 +278,165 @@ static org.apache.parquet.format.TimeUnit convertUnit(LogicalTypeAnnotation.Time private static class ConvertedTypeConverterVisitor implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor { @Override - public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(ConvertedType.UTF8); + public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) { + return of(ConvertedType.UTF8); } @Override - public Optional 
visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(ConvertedType.MAP); + public Optional visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) { + return of(ConvertedType.MAP); } @Override - public Optional visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(ConvertedType.LIST); + public Optional visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) { + return of(ConvertedType.LIST); } @Override - public Optional visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(ConvertedType.ENUM); + public Optional visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { + return of(ConvertedType.ENUM); } @Override - public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(ConvertedType.DECIMAL); + public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { + return of(ConvertedType.DECIMAL); } @Override - public Optional visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(ConvertedType.DATE); + public Optional visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { + return of(ConvertedType.DATE); } @Override - public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation logicalTypeAnnotation) { - switch (logicalTypeAnnotation.getUnit()) { + public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { + switch (timeLogicalType.getUnit()) { case MILLIS: - return Optional.of(ConvertedType.TIME_MILLIS); + return of(ConvertedType.TIME_MILLIS); case MICROS: - return Optional.of(ConvertedType.TIME_MICROS); + return of(ConvertedType.TIME_MICROS); + case NANOS: + return empty(); default: - throw new RuntimeException("Unknown converted type for " + logicalTypeAnnotation.toOriginalType()); + throw new 
RuntimeException("Unknown converted type for " + timeLogicalType.toOriginalType()); } } @Override - public Optional visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation logicalTypeAnnotation) { - switch (logicalTypeAnnotation.getUnit()) { + public Optional visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { + switch (timestampLogicalType.getUnit()) { case MICROS: - return Optional.of(ConvertedType.TIMESTAMP_MICROS); + return of(ConvertedType.TIMESTAMP_MICROS); case MILLIS: - return Optional.of(ConvertedType.TIMESTAMP_MILLIS); + return of(ConvertedType.TIMESTAMP_MILLIS); + case NANOS: + return empty(); default: - throw new RuntimeException("Unknown converted type for " + logicalTypeAnnotation.toOriginalType()); + throw new RuntimeException("Unknown converted type for " + timestampLogicalType.toOriginalType()); } } @Override - public Optional visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation logicalTypeAnnotation) { - boolean signed = logicalTypeAnnotation.isSigned(); - switch (logicalTypeAnnotation.getBitWidth()) { + public Optional visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) { + boolean signed = intLogicalType.isSigned(); + switch (intLogicalType.getBitWidth()) { case 8: - return Optional.of(signed ? ConvertedType.INT_8 : ConvertedType.UINT_8); + return of(signed ? ConvertedType.INT_8 : ConvertedType.UINT_8); case 16: - return Optional.of(signed ? ConvertedType.INT_16 : ConvertedType.UINT_16); + return of(signed ? ConvertedType.INT_16 : ConvertedType.UINT_16); case 32: - return Optional.of(signed ? ConvertedType.INT_32 : ConvertedType.UINT_32); + return of(signed ? ConvertedType.INT_32 : ConvertedType.UINT_32); case 64: - return Optional.of(signed ? ConvertedType.INT_64 : ConvertedType.UINT_64); + return of(signed ? 
ConvertedType.INT_64 : ConvertedType.UINT_64); default: - throw new RuntimeException("Unknown original type " + logicalTypeAnnotation.toOriginalType()); + throw new RuntimeException("Unknown original type " + intLogicalType.toOriginalType()); } } @Override - public Optional visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(ConvertedType.JSON); + public Optional visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) { + return of(ConvertedType.JSON); } @Override - public Optional visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(ConvertedType.BSON); + public Optional visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) { + return of(ConvertedType.BSON); } @Override - public Optional visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(ConvertedType.INTERVAL); + public Optional visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) { + return of(ConvertedType.INTERVAL); } @Override - public Optional visit(LogicalTypeAnnotation.MapKeyValueTypeAnnotation logicalTypeAnnotation) { - return Optional.of(ConvertedType.MAP_KEY_VALUE); + public Optional visit(LogicalTypeAnnotation.MapKeyValueTypeAnnotation mapKeyValueLogicalType) { + return of(ConvertedType.MAP_KEY_VALUE); } } private static class LogicalTypeConverterVisitor implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor { @Override - public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(LogicalType.STRING(new StringType())); + public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) { + return of(LogicalType.STRING(new StringType())); } @Override - public Optional visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(LogicalType.MAP(new MapType())); + public 
Optional visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) { + return of(LogicalType.MAP(new MapType())); } @Override - public Optional visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(LogicalType.LIST(new ListType())); + public Optional visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) { + return of(LogicalType.LIST(new ListType())); } @Override - public Optional visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(LogicalType.ENUM(new EnumType())); + public Optional visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { + return of(LogicalType.ENUM(new EnumType())); } @Override - public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(LogicalType.DECIMAL(new DecimalType(logicalTypeAnnotation.getScale(), logicalTypeAnnotation.getPrecision()))); + public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { + return of(LogicalType.DECIMAL(new DecimalType(decimalLogicalType.getScale(), decimalLogicalType.getPrecision()))); } @Override - public Optional visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(LogicalType.DATE(new DateType())); + public Optional visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { + return of(LogicalType.DATE(new DateType())); } @Override - public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(LogicalType.TIME(new TimeType(logicalTypeAnnotation.isAdjustedToUTC(), convertUnit(logicalTypeAnnotation.getUnit())))); + public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { + return of(LogicalType.TIME(new TimeType(timeLogicalType.isAdjustedToUTC(), convertUnit(timeLogicalType.getUnit())))); } @Override - public Optional 
visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(LogicalType.TIMESTAMP(new TimestampType(logicalTypeAnnotation.isAdjustedToUTC(), convertUnit(logicalTypeAnnotation.getUnit())))); + public Optional visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { + return of(LogicalType.TIMESTAMP(new TimestampType(timestampLogicalType.isAdjustedToUTC(), convertUnit(timestampLogicalType.getUnit())))); } @Override - public Optional visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(LogicalType.INTEGER(new IntType((byte) logicalTypeAnnotation.getBitWidth(), logicalTypeAnnotation.isSigned()))); + public Optional visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) { + return of(LogicalType.INTEGER(new IntType((byte) intLogicalType.getBitWidth(), intLogicalType.isSigned()))); } @Override - public Optional visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(LogicalType.JSON(new JsonType())); + public Optional visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) { + return of(LogicalType.JSON(new JsonType())); } @Override - public Optional visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(LogicalType.BSON(new BsonType())); + public Optional visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) { + return of(LogicalType.BSON(new BsonType())); } @Override - public Optional visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(LogicalType.UNKNOWN(new NullType())); + public Optional visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) { + return of(LogicalType.UNKNOWN(new NullType())); } @Override - public Optional visit(LogicalTypeAnnotation.MapKeyValueTypeAnnotation logicalTypeAnnotation) { - return Optional.of(LogicalType.UNKNOWN(new 
NullType())); + public Optional visit(LogicalTypeAnnotation.MapKeyValueTypeAnnotation mapKeyValueLogicalType) { + return of(LogicalType.UNKNOWN(new NullType())); } } @@ -449,6 +467,17 @@ private void addRowGroup(ParquetMetadata parquetMetadata, List rowGrou // columnChunk.meta_data.index_page_offset = ; // columnChunk.meta_data.key_value_metadata = ; // nothing yet + IndexReference columnIndexRef = columnMetaData.getColumnIndexReference(); + if (columnIndexRef != null) { + columnChunk.setColumn_index_offset(columnIndexRef.getOffset()); + columnChunk.setColumn_index_length(columnIndexRef.getLength()); + } + IndexReference offsetIndexRef = columnMetaData.getOffsetIndexReference(); + if (offsetIndexRef != null) { + columnChunk.setOffset_index_offset(offsetIndexRef.getOffset()); + columnChunk.setOffset_index_length(offsetIndexRef.getLength()); + } + parquetColumns.add(columnChunk); } RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount()); @@ -670,9 +699,11 @@ enum SortOrder { UNKNOWN } - private static final Set STRING_TYPES = Collections + private static final Set STRING_TYPES = Collections .unmodifiableSet(new HashSet<>(Arrays.asList( - OriginalType.UTF8, OriginalType.ENUM, OriginalType.JSON + LogicalTypeAnnotation.StringLogicalTypeAnnotation.class, + LogicalTypeAnnotation.EnumLogicalTypeAnnotation.class, + LogicalTypeAnnotation.JsonLogicalTypeAnnotation.class ))); /** @@ -689,10 +720,10 @@ private boolean overrideSortOrderToSigned(PrimitiveType type) { // even if the override is set, only return stats for string-ish types // a null type annotation is considered string-ish because some writers // failed to use the UTF8 annotation. 
- OriginalType annotation = type.getOriginalType(); + LogicalTypeAnnotation annotation = type.getLogicalTypeAnnotation(); return useSignedStringMinMax && PrimitiveTypeName.BINARY == type.getPrimitiveTypeName() && - (annotation == null || STRING_TYPES.contains(annotation)); + (annotation == null || STRING_TYPES.contains(annotation.getClass())); } /** @@ -719,36 +750,76 @@ private static SortOrder defaultSortOrder(PrimitiveTypeName primitive) { * @return the "correct" sort order of the type that applications assume */ private static SortOrder sortOrder(PrimitiveType primitive) { - OriginalType annotation = primitive.getOriginalType(); + LogicalTypeAnnotation annotation = primitive.getLogicalTypeAnnotation(); if (annotation != null) { - switch (annotation) { - case INT_8: - case INT_16: - case INT_32: - case INT_64: - case DATE: - case TIME_MICROS: - case TIME_MILLIS: - case TIMESTAMP_MICROS: - case TIMESTAMP_MILLIS: - return SortOrder.SIGNED; - case UINT_8: - case UINT_16: - case UINT_32: - case UINT_64: - case ENUM: - case UTF8: - case BSON: - case JSON: - return SortOrder.UNSIGNED; - case DECIMAL: - case LIST: - case MAP: - case MAP_KEY_VALUE: - case INTERVAL: - return SortOrder.UNKNOWN; - } + return annotation.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public Optional visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) { + return intLogicalType.isSigned() ? 
of(SortOrder.SIGNED) : of(SortOrder.UNSIGNED); + } + + @Override + public Optional visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) { + return of(SortOrder.UNKNOWN); + } + + @Override + public Optional visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { + return of(SortOrder.SIGNED); + } + + @Override + public Optional visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { + return of(SortOrder.UNSIGNED); + } + + @Override + public Optional visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) { + return of(SortOrder.UNSIGNED); + } + + @Override + public Optional visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) { + return of(SortOrder.UNSIGNED); + } + + @Override + public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) { + return of(SortOrder.UNSIGNED); + } + + @Override + public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { + return of(SortOrder.UNKNOWN); + } + + @Override + public Optional visit(LogicalTypeAnnotation.MapKeyValueTypeAnnotation mapKeyValueLogicalType) { + return of(SortOrder.UNKNOWN); + } + + @Override + public Optional visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) { + return of(SortOrder.UNKNOWN); + } + + @Override + public Optional visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) { + return of(SortOrder.UNKNOWN); + } + + @Override + public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { + return of(SortOrder.SIGNED); + } + + @Override + public Optional visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { + return of(SortOrder.SIGNED); + } + }).orElse(defaultSortOrder(primitive.getPrimitiveTypeName())); } + return defaultSortOrder(primitive.getPrimitiveTypeName()); } @@ -894,6 +965,8 @@ private LogicalTypeAnnotation.TimeUnit convertTimeUnit(TimeUnit unit) { 
return LogicalTypeAnnotation.TimeUnit.MICROS; case MILLIS: return LogicalTypeAnnotation.TimeUnit.MILLIS; + case NANOS: + return LogicalTypeAnnotation.TimeUnit.NANOS; default: throw new RuntimeException("Unknown time unit " + unit); } @@ -1117,6 +1190,8 @@ public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws metaData.num_values, metaData.total_compressed_size, metaData.total_uncompressed_size); + column.setColumnIndexReference(toColumnIndexReference(columnChunk)); + column.setOffsetIndexReference(toOffsetIndexReference(columnChunk)); // TODO // index_page_offset // key_value_metadata @@ -1138,6 +1213,20 @@ public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws blocks); } + private static IndexReference toColumnIndexReference(ColumnChunk columnChunk) { + if (columnChunk.isSetColumn_index_offset() && columnChunk.isSetColumn_index_length()) { + return new IndexReference(columnChunk.getColumn_index_offset(), columnChunk.getColumn_index_length()); + } + return null; + } + + private static IndexReference toOffsetIndexReference(ColumnChunk columnChunk) { + if (columnChunk.isSetOffset_index_offset() && columnChunk.isSetOffset_index_length()) { + return new IndexReference(columnChunk.getOffset_index_offset(), columnChunk.getOffset_index_length()); + } + return null; + } + private static ColumnPath getPath(ColumnMetaData metaData) { String[] path = metaData.path_in_schema.toArray(new String[metaData.path_in_schema.size()]); return ColumnPath.get(path); @@ -1332,4 +1421,78 @@ public void writeDictionaryPageHeader( writePageHeader(pageHeader, to); } + private static BoundaryOrder toParquetBoundaryOrder( + org.apache.parquet.internal.column.columnindex.BoundaryOrder boundaryOrder) { + switch (boundaryOrder) { + case ASCENDING: + return BoundaryOrder.ASCENDING; + case DESCENDING: + return BoundaryOrder.DESCENDING; + case UNORDERED: + return BoundaryOrder.UNORDERED; + default: + throw new IllegalArgumentException("Unsupported 
boundary order: " + boundaryOrder); + } + } + + private static org.apache.parquet.internal.column.columnindex.BoundaryOrder fromParquetBoundaryOrder( + BoundaryOrder boundaryOrder) { + switch (boundaryOrder) { + case ASCENDING: + return org.apache.parquet.internal.column.columnindex.BoundaryOrder.ASCENDING; + case DESCENDING: + return org.apache.parquet.internal.column.columnindex.BoundaryOrder.DESCENDING; + case UNORDERED: + return org.apache.parquet.internal.column.columnindex.BoundaryOrder.UNORDERED; + default: + throw new IllegalArgumentException("Unsupported boundary order: " + boundaryOrder); + } + } + + public static ColumnIndex toParquetColumnIndex(PrimitiveType type, + org.apache.parquet.internal.column.columnindex.ColumnIndex columnIndex) { + if (!isMinMaxStatsSupported(type) || columnIndex == null) { + return null; + } + ColumnIndex parquetColumnIndex = new ColumnIndex( + columnIndex.getNullPages(), + columnIndex.getMinValues(), + columnIndex.getMaxValues(), + toParquetBoundaryOrder(columnIndex.getBoundaryOrder())); + parquetColumnIndex.setNull_counts(columnIndex.getNullCounts()); + return parquetColumnIndex; + } + + public static org.apache.parquet.internal.column.columnindex.ColumnIndex fromParquetColumnIndex(PrimitiveType type, + ColumnIndex parquetColumnIndex) { + if (!isMinMaxStatsSupported(type)) { + return null; + } + return ColumnIndexBuilder.build(type, + fromParquetBoundaryOrder(parquetColumnIndex.getBoundary_order()), + parquetColumnIndex.getNull_pages(), + parquetColumnIndex.getNull_counts(), + parquetColumnIndex.getMin_values(), + parquetColumnIndex.getMax_values()); + } + + public static OffsetIndex toParquetOffsetIndex(org.apache.parquet.internal.column.columnindex.OffsetIndex offsetIndex) { + List pageLocations = new ArrayList<>(offsetIndex.getPageCount()); + for (int i = 0, n = offsetIndex.getPageCount(); i < n; ++i) { + pageLocations.add(new PageLocation( + offsetIndex.getOffset(i), + offsetIndex.getCompressedPageSize(i), + 
offsetIndex.getFirstRowIndex(i))); + } + return new OffsetIndex(pageLocations); + } + + public static org.apache.parquet.internal.column.columnindex.OffsetIndex fromParquetOffsetIndex( + OffsetIndex parquetOffsetIndex) { + OffsetIndexBuilder builder = OffsetIndexBuilder.getBuilder(); + for (PageLocation pageLocation : parquetOffsetIndex.getPage_locations()) { + builder.add(pageLocation.getOffset(), pageLocation.getCompressed_page_size(), pageLocation.getFirst_row_index()); + } + return builder.build(); + } } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/BloomFilterDataReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/BloomFilterDataReader.java index 6b861e55c5..96e258fe40 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/BloomFilterDataReader.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/BloomFilterDataReader.java @@ -17,11 +17,9 @@ * under the License. */ package org.apache.parquet.hadoop; - import java.io.IOException; import java.util.HashMap; import java.util.Map; - import org.apache.parquet.Strings; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.values.bloomfilter.BloomFilter; @@ -29,18 +27,15 @@ import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.io.ParquetDecodingException; - /** * A {@link BloomFilterReader} implementation that reads Bloom filter data from * an open {@link ParquetFileReader}. 
* */ - public class BloomFilterDataReader implements BloomFilterReader { private final ParquetFileReader reader; private final Map columns; private final Map cache = new HashMap<>(); - public BloomFilterDataReader(ParquetFileReader fileReader, BlockMetaData block) { this.reader = fileReader; this.columns = new HashMap<>(); @@ -48,7 +43,6 @@ public BloomFilterDataReader(ParquetFileReader fileReader, BlockMetaData block) columns.put(column.getPath().toDotString(), column); } } - @Override public BloomFilter readBloomFilter(ColumnDescriptor descriptor) { String dotPath = Strings.join(descriptor.getPath(), "."); @@ -57,11 +51,9 @@ public BloomFilter readBloomFilter(ColumnDescriptor descriptor) { throw new ParquetDecodingException( "Cannot load Bloom filter data, unknown column: " + dotPath); } - if (cache.containsKey(dotPath)) { return cache.get(dotPath); } - try { synchronized (cache) { if (!cache.containsKey(dotPath)) { @@ -70,7 +62,6 @@ public BloomFilter readBloomFilter(ColumnDescriptor descriptor) { cache.put(dotPath, bloomFilter); } } - return cache.get(dotPath); } catch (IOException e) { throw new ParquetDecodingException( diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageReadStore.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageReadStore.java index 37dfd6d394..0dc71e0743 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageReadStore.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageReadStore.java @@ -18,13 +18,17 @@ */ package org.apache.parquet.hadoop; +import static org.apache.parquet.Ints.checkedCast; + import java.io.IOException; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; - +import java.util.Optional; +import java.util.PrimitiveIterator; import org.apache.parquet.Ints; +import org.apache.parquet.bytes.BytesInput; import org.apache.parquet.column.ColumnDescriptor; import 
org.apache.parquet.column.page.DataPage; import org.apache.parquet.column.page.DataPageV1; @@ -33,9 +37,9 @@ import org.apache.parquet.column.page.DictionaryPageReadStore; import org.apache.parquet.column.page.PageReadStore; import org.apache.parquet.column.page.PageReader; -import org.apache.parquet.compression.CompressionCodecFactory; import org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor; -import org.apache.parquet.hadoop.CodecFactory.BytesDecompressor; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; import org.apache.parquet.io.ParquetDecodingException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -62,8 +66,13 @@ static final class ColumnChunkPageReader implements PageReader { private final long valueCount; private final List compressedPages; private final DictionaryPage compressedDictionaryPage; + // null means no page synchronization is required; firstRowIndex will not be returned by the pages + private final OffsetIndex offsetIndex; + private final long rowCount; + private int pageIndex = 0; - ColumnChunkPageReader(BytesInputDecompressor decompressor, List compressedPages, DictionaryPage compressedDictionaryPage) { + ColumnChunkPageReader(BytesInputDecompressor decompressor, List compressedPages, + DictionaryPage compressedDictionaryPage, OffsetIndex offsetIndex, long rowCount) { this.decompressor = decompressor; this.compressedPages = new LinkedList(compressedPages); this.compressedDictionaryPage = compressedDictionaryPage; @@ -72,6 +81,8 @@ static final class ColumnChunkPageReader implements PageReader { count += p.getValueCount(); } this.valueCount = count; + this.offsetIndex = offsetIndex; + this.rowCount = rowCount; } @Override @@ -85,18 +96,34 @@ public DataPage readPage() { return null; } DataPage compressedPage = compressedPages.remove(0); + final int currentPageIndex = pageIndex++; return compressedPage.accept(new 
DataPage.Visitor() { @Override public DataPage visit(DataPageV1 dataPageV1) { try { - return new DataPageV1( - decompressor.decompress(dataPageV1.getBytes(), dataPageV1.getUncompressedSize()), - dataPageV1.getValueCount(), - dataPageV1.getUncompressedSize(), - dataPageV1.getStatistics(), - dataPageV1.getRlEncoding(), - dataPageV1.getDlEncoding(), - dataPageV1.getValueEncoding()); + BytesInput decompressed = decompressor.decompress(dataPageV1.getBytes(), dataPageV1.getUncompressedSize()); + if (offsetIndex == null) { + return new DataPageV1( + decompressed, + dataPageV1.getValueCount(), + dataPageV1.getUncompressedSize(), + dataPageV1.getStatistics(), + dataPageV1.getRlEncoding(), + dataPageV1.getDlEncoding(), + dataPageV1.getValueEncoding()); + } else { + long firstRowIndex = offsetIndex.getFirstRowIndex(currentPageIndex); + return new DataPageV1( + decompressed, + dataPageV1.getValueCount(), + dataPageV1.getUncompressedSize(), + firstRowIndex, + checkedCast(offsetIndex.getLastRowIndex(currentPageIndex, rowCount) - firstRowIndex + 1), + dataPageV1.getStatistics(), + dataPageV1.getRlEncoding(), + dataPageV1.getDlEncoding(), + dataPageV1.getValueEncoding()); + } } catch (IOException e) { throw new ParquetDecodingException("could not decompress page", e); } @@ -105,23 +132,49 @@ public DataPage visit(DataPageV1 dataPageV1) { @Override public DataPage visit(DataPageV2 dataPageV2) { if (!dataPageV2.isCompressed()) { - return dataPageV2; + if (offsetIndex == null) { + return dataPageV2; + } else { + return DataPageV2.uncompressed( + dataPageV2.getRowCount(), + dataPageV2.getNullCount(), + dataPageV2.getValueCount(), + offsetIndex.getFirstRowIndex(currentPageIndex), + dataPageV2.getRepetitionLevels(), + dataPageV2.getDefinitionLevels(), + dataPageV2.getDataEncoding(), + dataPageV2.getData(), + dataPageV2.getStatistics()); + } } try { int uncompressedSize = Ints.checkedCast( dataPageV2.getUncompressedSize() - - dataPageV2.getDefinitionLevels().size() - - 
dataPageV2.getRepetitionLevels().size()); - return DataPageV2.uncompressed( - dataPageV2.getRowCount(), - dataPageV2.getNullCount(), - dataPageV2.getValueCount(), - dataPageV2.getRepetitionLevels(), - dataPageV2.getDefinitionLevels(), - dataPageV2.getDataEncoding(), - decompressor.decompress(dataPageV2.getData(), uncompressedSize), - dataPageV2.getStatistics() - ); + - dataPageV2.getDefinitionLevels().size() + - dataPageV2.getRepetitionLevels().size()); + BytesInput decompressed = decompressor.decompress(dataPageV2.getData(), uncompressedSize); + if (offsetIndex == null) { + return DataPageV2.uncompressed( + dataPageV2.getRowCount(), + dataPageV2.getNullCount(), + dataPageV2.getValueCount(), + dataPageV2.getRepetitionLevels(), + dataPageV2.getDefinitionLevels(), + dataPageV2.getDataEncoding(), + decompressed, + dataPageV2.getStatistics()); + } else { + return DataPageV2.uncompressed( + dataPageV2.getRowCount(), + dataPageV2.getNullCount(), + dataPageV2.getValueCount(), + offsetIndex.getFirstRowIndex(currentPageIndex), + dataPageV2.getRepetitionLevels(), + dataPageV2.getDefinitionLevels(), + dataPageV2.getDataEncoding(), + decompressed, + dataPageV2.getStatistics()); + } } catch (IOException e) { throw new ParquetDecodingException("could not decompress page", e); } @@ -147,9 +200,16 @@ public DictionaryPage readDictionaryPage() { private final Map readers = new HashMap(); private final long rowCount; + private final RowRanges rowRanges; public ColumnChunkPageReadStore(long rowCount) { this.rowCount = rowCount; + rowRanges = null; + } + + ColumnChunkPageReadStore(RowRanges rowRanges) { + this.rowRanges = rowRanges; + rowCount = rowRanges.rowCount(); } @Override @@ -170,6 +230,11 @@ public DictionaryPage readDictionaryPage(ColumnDescriptor descriptor) { return readers.get(descriptor).readDictionaryPage(); } + @Override + public Optional getRowIndexes() { + return rowRanges == null ? 
Optional.empty() : Optional.of(rowRanges.iterator()); + } + void addColumn(ColumnDescriptor path, ColumnChunkPageReader reader) { if (readers.put(path, reader) != null) { throw new RuntimeException(path+ " was added twice"); diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java index caa41fc7c0..85bdbdbd9b 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java @@ -28,7 +28,6 @@ import java.util.Set; import org.apache.parquet.bytes.BytesInput; -import org.apache.parquet.bytes.ByteBufferAllocator; import org.apache.parquet.bytes.ConcatenatingByteArrayCollector; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.Encoding; @@ -36,23 +35,22 @@ import org.apache.parquet.column.page.PageWriteStore; import org.apache.parquet.column.page.PageWriter; import org.apache.parquet.column.statistics.Statistics; -import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter; -import org.apache.parquet.column.values.bloomfilter.BloomFilter; -import org.apache.parquet.column.values.bloomfilter.BloomFilterWriteStore; -import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter; import org.apache.parquet.format.converter.ParquetMetadataConverter; import org.apache.parquet.hadoop.CodecFactory.BytesCompressor; +import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder; +import org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder; import org.apache.parquet.io.ParquetEncodingException; import org.apache.parquet.schema.MessageType; +import org.apache.parquet.bytes.ByteBufferAllocator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -class ColumnChunkPageWriteStore implements PageWriteStore, BloomFilterWriteStore { +class 
ColumnChunkPageWriteStore implements PageWriteStore { private static final Logger LOG = LoggerFactory.getLogger(ColumnChunkPageWriteStore.class); private static ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter(); - private static final class ColumnChunkPageWriter implements PageWriter, BloomFilterWriter { + private static final class ColumnChunkPageWriter implements PageWriter { private final ColumnDescriptor path; private final BytesCompressor compressor; @@ -60,7 +58,6 @@ private static final class ColumnChunkPageWriter implements PageWriter, BloomFil private final ByteArrayOutputStream tempOutputStream = new ByteArrayOutputStream(); private final ConcatenatingByteArrayCollector buf; private DictionaryPage dictionaryPage; - private BloomFilter bloomFilter; private long uncompressedLength; private long compressedLength; @@ -72,21 +69,38 @@ private static final class ColumnChunkPageWriter implements PageWriter, BloomFil private Set dlEncodings = new HashSet(); private List dataEncodings = new ArrayList(); + private ColumnIndexBuilder columnIndexBuilder; + private OffsetIndexBuilder offsetIndexBuilder; private Statistics totalStatistics; private final ByteBufferAllocator allocator; private ColumnChunkPageWriter(ColumnDescriptor path, BytesCompressor compressor, - ByteBufferAllocator allocator) { + ByteBufferAllocator allocator, + int columnIndexTruncateLength) { this.path = path; this.compressor = compressor; this.allocator = allocator; this.buf = new ConcatenatingByteArrayCollector(); + this.columnIndexBuilder = ColumnIndexBuilder.getBuilder(path.getPrimitiveType(), columnIndexTruncateLength); + this.offsetIndexBuilder = OffsetIndexBuilder.getBuilder(); + } + + @Override + @Deprecated + public void writePage(BytesInput bytesInput, int valueCount, Statistics statistics, Encoding rlEncoding, + Encoding dlEncoding, Encoding valuesEncoding) throws IOException { + // Setting the builders to the no-op ones so no column/offset indexes will 
be written for this column chunk + columnIndexBuilder = ColumnIndexBuilder.getNoOpBuilder(); + offsetIndexBuilder = OffsetIndexBuilder.getNoOpBuilder(); + + writePage(bytesInput, valueCount, -1, statistics, rlEncoding, dlEncoding, valuesEncoding); } @Override public void writePage(BytesInput bytes, int valueCount, + int rowCount, Statistics statistics, Encoding rlEncoding, Encoding dlEncoding, @@ -126,6 +140,9 @@ public void writePage(BytesInput bytes, totalStatistics.mergeStatistics(statistics); } + columnIndexBuilder.add(statistics); + offsetIndexBuilder.add(toIntWithCheck(tempOutputStream.size() + compressedSize), rowCount); + // by concatenating before collecting instead of collecting twice, // we only allocate one buffer to copy into instead of multiple. buf.collect(BytesInput.concat(BytesInput.from(tempOutputStream), compressedBytes)); @@ -171,6 +188,9 @@ public void writePageV2( totalStatistics.mergeStatistics(statistics); } + columnIndexBuilder.add(statistics); + offsetIndexBuilder.add(toIntWithCheck((long) tempOutputStream.size() + compressedSize), rowCount); + // by concatenating before collecting instead of collecting twice, // we only allocate one buffer to copy into instead of multiple. 
buf.collect( @@ -198,18 +218,20 @@ public long getMemSize() { } public void writeToFileWriter(ParquetFileWriter writer) throws IOException { - writer.startColumn(path, totalValueCount, compressor.getCodecName()); - if (bloomFilter != null) { - writer.writeBloomFilter(bloomFilter); - } - - if (dictionaryPage != null) { - writer.writeDictionaryPage(dictionaryPage); - // tracking the dictionary encoding is handled in writeDictionaryPage - } - writer.writeDataPages(buf, uncompressedLength, compressedLength, totalStatistics, - rlEncodings, dlEncodings, dataEncodings); - writer.endColumn(); + writer.writeColumnChunk( + path, + totalValueCount, + compressor.getCodecName(), + dictionaryPage, + buf, + uncompressedLength, + compressedLength, + totalStatistics, + columnIndexBuilder, + offsetIndexBuilder, + rlEncodings, + dlEncodings, + dataEncodings); if (LOG.isDebugEnabled()) { LOG.debug( String.format( @@ -247,20 +269,16 @@ public String memUsageString(String prefix) { return buf.memUsageString(prefix + " ColumnChunkPageWriter"); } - @Override - public void writeBloomFilter(BloomFilter bloomFilter) { - this.bloomFilter = bloomFilter; - } - } private final Map writers = new HashMap(); private final MessageType schema; - public ColumnChunkPageWriteStore(BytesCompressor compressor, MessageType schema, ByteBufferAllocator allocator) { + public ColumnChunkPageWriteStore(BytesCompressor compressor, MessageType schema, ByteBufferAllocator allocator, + int columnIndexTruncateLength) { this.schema = schema; for (ColumnDescriptor path : schema.getColumns()) { - writers.put(path, new ColumnChunkPageWriter(path, compressor, allocator)); + writers.put(path, new ColumnChunkPageWriter(path, compressor, allocator, columnIndexTruncateLength)); } } @@ -269,11 +287,6 @@ public PageWriter getPageWriter(ColumnDescriptor path) { return writers.get(path); } - @Override - public BloomFilterWriter getBloomFilterWriter(ColumnDescriptor path) { - return writers.get(path); - } - public void 
flushToFileWriter(ParquetFileWriter writer) throws IOException { for (ColumnDescriptor path : schema.getColumns()) { ColumnChunkPageWriter pageWriter = writers.get(path); @@ -281,4 +294,9 @@ public void flushToFileWriter(ParquetFileWriter writer) throws IOException { } } + void flushToFileWriter(ColumnDescriptor path, ParquetFileWriter writer) throws IOException { + ColumnChunkPageWriter pageWriter = writers.get(path); + pageWriter.writeToFileWriter(writer); + } + } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnIndexFilterUtils.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnIndexFilterUtils.java new file mode 100644 index 0000000000..448515e2a9 --- /dev/null +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnIndexFilterUtils.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.hadoop; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Formatter; +import java.util.List; + +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; + +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.ints.IntList; + +/** + * Internal utility class to help at column index based filtering. + */ +class ColumnIndexFilterUtils { + static class OffsetRange { + private final long offset; + private long length; + + private OffsetRange(long offset, int length) { + this.offset = offset; + this.length = length; + } + + long getOffset() { + return offset; + } + + long getLength() { + return length; + } + + private boolean extend(long offset, int length) { + if (this.offset + this.length == offset) { + this.length += length; + return true; + } else { + return false; + } + } + } + + private static class FilteredOffsetIndex implements OffsetIndex { + private final OffsetIndex offsetIndex; + private final int[] indexMap; + + private FilteredOffsetIndex(OffsetIndex offsetIndex, int[] indexMap) { + this.offsetIndex = offsetIndex; + this.indexMap = indexMap; + } + + @Override + public int getPageCount() { + return indexMap.length; + } + + @Override + public long getOffset(int pageIndex) { + return offsetIndex.getOffset(indexMap[pageIndex]); + } + + @Override + public int getCompressedPageSize(int pageIndex) { + return offsetIndex.getCompressedPageSize(indexMap[pageIndex]); + } + + @Override + public long getFirstRowIndex(int pageIndex) { + return offsetIndex.getFirstRowIndex(indexMap[pageIndex]); + } + + @Override + public long getLastRowIndex(int pageIndex, long totalRowCount) { + int nextIndex = indexMap[pageIndex] + 1; + return (nextIndex >= offsetIndex.getPageCount() ? 
totalRowCount : offsetIndex.getFirstRowIndex(nextIndex)) - 1; + } + + @Override + public String toString() { + try (Formatter formatter = new Formatter()) { + formatter.format("%-12s %20s %16s %20s\n", "", "offset", "compressed size", "first row index"); + for (int i = 0, n = offsetIndex.getPageCount(); i < n; ++i) { + int index = Arrays.binarySearch(indexMap, i); + boolean isHidden = index < 0; + formatter.format("%spage-%-5d %20d %16d %20d\n", + isHidden ? "- " : " ", + isHidden ? i : index, + offsetIndex.getOffset(i), + offsetIndex.getCompressedPageSize(i), + offsetIndex.getFirstRowIndex(i)); + } + return formatter.toString(); + } + } + } + + /* + * Returns the filtered offset index containing only the pages which are overlapping with rowRanges. + */ + static OffsetIndex filterOffsetIndex(OffsetIndex offsetIndex, RowRanges rowRanges, long totalRowCount) { + IntList indexMap = new IntArrayList(); + for (int i = 0, n = offsetIndex.getPageCount(); i < n; ++i) { + long from = offsetIndex.getFirstRowIndex(i); + if (rowRanges.isOverlapping(from, offsetIndex.getLastRowIndex(i, totalRowCount))) { + indexMap.add(i); + } + } + return new FilteredOffsetIndex(offsetIndex, indexMap.toIntArray()); + } + + static List calculateOffsetRanges(OffsetIndex offsetIndex, ColumnChunkMetaData cm, + long firstPageOffset) { + List ranges = new ArrayList<>(); + int n = offsetIndex.getPageCount(); + if (n > 0) { + OffsetRange currentRange = null; + + // Add a range for the dictionary page if required + long rowGroupOffset = cm.getStartingPos(); + if (rowGroupOffset < firstPageOffset) { + currentRange = new OffsetRange(rowGroupOffset, (int) (firstPageOffset - rowGroupOffset)); + ranges.add(currentRange); + } + + for (int i = 0; i < n; ++i) { + long offset = offsetIndex.getOffset(i); + int length = offsetIndex.getCompressedPageSize(i); + if (currentRange == null || !currentRange.extend(offset, length)) { + currentRange = new OffsetRange(offset, length); + ranges.add(currentRange); + } + } + 
} + return ranges; + } +} diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnIndexStoreImpl.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnIndexStoreImpl.java new file mode 100644 index 0000000000..684c5f2114 --- /dev/null +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnIndexStoreImpl.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.hadoop; + +import static java.util.Collections.emptySet; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Internal implementation of {@link ColumnIndexStore}. 
+ */ +class ColumnIndexStoreImpl implements ColumnIndexStore { + + private interface IndexStore { + ColumnIndex getColumnIndex(); + + OffsetIndex getOffsetIndex(); + } + + private class IndexStoreImpl implements IndexStore { + private final ColumnChunkMetaData meta; + private ColumnIndex columnIndex; + private boolean columnIndexRead; + private final OffsetIndex offsetIndex; + + IndexStoreImpl(ColumnChunkMetaData meta) { + this.meta = meta; + OffsetIndex oi; + try { + oi = reader.readOffsetIndex(meta); + } catch (IOException e) { + // If the I/O issue still stands it will fail the reading later; + // otherwise we fail the filtering only with a missing offset index. + LOGGER.warn("Unable to read offset index for column {}", meta.getPath(), e); + oi = null; + } + if (oi == null) { + throw new MissingOffsetIndexException(meta.getPath()); + } + offsetIndex = oi; + } + + @Override + public ColumnIndex getColumnIndex() { + if (!columnIndexRead) { + try { + columnIndex = reader.readColumnIndex(meta); + } catch (IOException e) { + // If the I/O issue still stands it will fail the reading later; + // otherwise we fail the filtering only with a missing column index. 
+ LOGGER.warn("Unable to read column index for column {}", meta.getPath(), e); + } + columnIndexRead = true; + } + return columnIndex; + } + + @Override + public OffsetIndex getOffsetIndex() { + return offsetIndex; + } + } + + private static final Logger LOGGER = LoggerFactory.getLogger(ColumnIndexStoreImpl.class); + // Used for columns that are not in this parquet file + private static final IndexStore MISSING_INDEX_STORE = new IndexStore() { + @Override + public ColumnIndex getColumnIndex() { + return null; + } + + @Override + public OffsetIndex getOffsetIndex() { + return null; + } + }; + private static final ColumnIndexStoreImpl EMPTY = new ColumnIndexStoreImpl(null, new BlockMetaData(), emptySet()) { + @Override + public ColumnIndex getColumnIndex(ColumnPath column) { + return null; + } + + @Override + public OffsetIndex getOffsetIndex(ColumnPath column) { + throw new MissingOffsetIndexException(column); + } + }; + + private final ParquetFileReader reader; + private final Map store; + + /* + * Creates a column index store for the columns in paths: offset indexes are read at creation time, column indexes + * lazily on first access. (paths are the set of columns used for the projection) + */ + static ColumnIndexStore create(ParquetFileReader reader, BlockMetaData block, Set paths) { + try { + return new ColumnIndexStoreImpl(reader, block, paths); + } catch (MissingOffsetIndexException e) { + return EMPTY; + } + } + + private ColumnIndexStoreImpl(ParquetFileReader reader, BlockMetaData block, Set paths) { + // TODO[GS]: Offset index for every path will be required; pre-read the consecutive ones at once? + // TODO[GS]: Pre-read column index based on filter? 
+ this.reader = reader; + Map store = new HashMap<>(); + for (ColumnChunkMetaData column : block.getColumns()) { + ColumnPath path = column.getPath(); + if (paths.contains(path)) { + store.put(path, new IndexStoreImpl(column)); + } + } + this.store = store; + } + + @Override + public ColumnIndex getColumnIndex(ColumnPath column) { + return store.getOrDefault(column, MISSING_INDEX_STORE).getColumnIndex(); + } + + @Override + public OffsetIndex getOffsetIndex(ColumnPath column) { + return store.getOrDefault(column, MISSING_INDEX_STORE).getOffsetIndex(); + } +} diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordReader.java index a048878693..e57f3cbcee 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordReader.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordReader.java @@ -124,7 +124,7 @@ private void checkRead() throws IOException { LOG.info("at row " + current + ". reading next block"); long t0 = System.currentTimeMillis(); - PageReadStore pages = reader.readNextRowGroup(); + PageReadStore pages = reader.readNextFilteredRowGroup(); if (pages == null) { throw new IOException("expecting more rows but reached last block. 
Read " + current + " out of " + total); } @@ -182,7 +182,7 @@ public void initialize(ParquetFileReader reader, ParquetReadOptions options) { this.columnCount = requestedSchema.getPaths().size(); this.recordConverter = readSupport.prepareForRead(conf, fileMetadata, fileSchema, readContext); this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true); - this.total = reader.getRecordCount(); + this.total = reader.getFilteredRecordCount(); this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total); this.filterRecords = options.useRecordFilter(); reader.setRequestedSchema(requestedSchema); @@ -204,7 +204,7 @@ public void initialize(ParquetFileReader reader, Configuration configuration) this.recordConverter = readSupport.prepareForRead( configuration, fileMetadata, fileSchema, readContext); this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true); - this.total = reader.getRecordCount(); + this.total = reader.getFilteredRecordCount(); this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total); this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true); reader.setRequestedSchema(requestedSchema); diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java index d9e9b5e15e..d8af379d13 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java @@ -20,7 +20,6 @@ import static java.lang.Math.max; import static java.lang.Math.min; -import static java.lang.String.format; import static org.apache.parquet.Preconditions.checkNotNull; import java.io.IOException; @@ -102,7 +101,8 @@ public ParquetMetadata getFooter() { } private void initStore() { - pageStore = new ColumnChunkPageWriteStore(compressor, 
schema, props.getAllocator()); + pageStore = new ColumnChunkPageWriteStore(compressor, schema, props.getAllocator(), + props.getColumnIndexTruncateLength()); columnStore = props.newColumnWriteStore(schema, pageStore); MessageColumnIO columnIO = new ColumnIOFactory(validating).getColumnIO(schema); this.recordConsumer = columnIO.getRecordWriter(columnStore); @@ -144,7 +144,7 @@ private void checkBlockSizeReached() throws IOException { // flush the row group if it is within ~2 records of the limit // it is much better to be slightly under size than to be over at all if (memSize > (nextRowGroupSize - 2 * recordSize)) { - LOG.info("mem size {} > {}: flushing {} records to disk.", memSize, nextRowGroupSize, recordCount); + LOG.debug("mem size {} > {}: flushing {} records to disk.", memSize, nextRowGroupSize, recordCount); flushRowGroupToStore(); initStore(); recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCount / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK); @@ -162,7 +162,7 @@ private void checkBlockSizeReached() throws IOException { private void flushRowGroupToStore() throws IOException { recordConsumer.flush(); - LOG.info("Flushing mem columnStore to file. allocated memory: {}", columnStore.getAllocatedSize()); + LOG.debug("Flushing mem columnStore to file. 
allocated memory: {}", columnStore.getAllocatedSize()); if (columnStore.getAllocatedSize() > (3 * rowGroupSizeThreshold)) { LOG.warn("Too much memory used: {}", columnStore.memUsageString()); } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java index 3975bf9f48..8e76634901 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java @@ -23,6 +23,8 @@ import static org.apache.parquet.filter2.compat.RowGroupFilter.FilterLevel.STATISTICS; import static org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER; import static org.apache.parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS; +import static org.apache.parquet.hadoop.ColumnIndexFilterUtils.calculateOffsetRanges; +import static org.apache.parquet.hadoop.ColumnIndexFilterUtils.filterOffsetIndex; import static org.apache.parquet.hadoop.ParquetFileWriter.MAGIC; import static org.apache.parquet.hadoop.ParquetFileWriter.PARQUET_COMMON_METADATA_FILE; import static org.apache.parquet.hadoop.ParquetFileWriter.PARQUET_METADATA_FILE; @@ -42,36 +44,38 @@ import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.Optional; import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; - +import org.apache.parquet.HadoopReadOptions; import org.apache.parquet.ParquetReadOptions; import org.apache.parquet.bytes.ByteBufferInputStream; import 
org.apache.parquet.bytes.BytesInput; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.Encoding; -import org.apache.parquet.column.page.DictionaryPageReadStore; -import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter; -import org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor; -import org.apache.parquet.filter2.compat.FilterCompat; -import org.apache.parquet.filter2.compat.RowGroupFilter; - import org.apache.parquet.column.page.DataPage; import org.apache.parquet.column.page.DataPageV1; import org.apache.parquet.column.page.DataPageV2; import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.page.DictionaryPageReadStore; +import org.apache.parquet.column.page.PageReader; import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter; import org.apache.parquet.column.values.bloomfilter.BloomFilter; -import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.RowGroupFilter; import org.apache.parquet.format.DataPageHeader; import org.apache.parquet.format.DataPageHeaderV2; import org.apache.parquet.format.DictionaryPageHeader; @@ -80,19 +84,27 @@ import org.apache.parquet.format.converter.ParquetMetadataConverter; import org.apache.parquet.format.converter.ParquetMetadataConverter.MetadataFilter; import org.apache.parquet.hadoop.ColumnChunkPageReadStore.ColumnChunkPageReader; +import org.apache.parquet.hadoop.ColumnIndexFilterUtils.OffsetRange; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.hadoop.metadata.FileMetaData; import 
org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.hadoop.util.HadoopInputFile; -import org.apache.parquet.HadoopReadOptions; import org.apache.parquet.hadoop.util.HiddenFileFilter; import org.apache.parquet.hadoop.util.counters.BenchmarkCounter; -import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; +import org.apache.parquet.internal.hadoop.metadata.IndexReference; import org.apache.parquet.io.InputFile; +import org.apache.parquet.io.ParquetDecodingException; import org.apache.parquet.io.SeekableInputStream; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.PrimitiveType; +import org.apache.yetus.audience.InterfaceAudience.Private; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -605,6 +617,8 @@ public static ParquetFileReader open(InputFile file, ParquetReadOptions options) private final Map paths = new HashMap<>(); private final FileMetaData fileMetaData; // may be null private final List blocks; + private final List blockIndexStores; + private final List blockRowRanges; // not final. in some cases, this may be lazily loaded for backward-compat. 
private ParquetMetadata footer; @@ -646,6 +660,8 @@ public ParquetFileReader( this.f = file.newStream(); this.options = HadoopReadOptions.builder(configuration).build(); this.blocks = filterRowGroups(blocks); + this.blockIndexStores = listWithNulls(this.blocks.size()); + this.blockRowRanges = listWithNulls(this.blocks.size()); for (ColumnDescriptor col : columns) { paths.put(ColumnPath.get(col.getPath()), col); } @@ -680,6 +696,8 @@ public ParquetFileReader(Configuration conf, Path file, ParquetMetadata footer) this.footer = footer; this.fileMetaData = footer.getFileMetaData(); this.blocks = filterRowGroups(footer.getBlocks()); + this.blockIndexStores = listWithNulls(this.blocks.size()); + this.blockRowRanges = listWithNulls(this.blocks.size()); for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) { paths.put(ColumnPath.get(col.getPath()), col); } @@ -700,11 +718,17 @@ public ParquetFileReader(InputFile file, ParquetReadOptions options) throws IOEx } this.fileMetaData = footer.getFileMetaData(); this.blocks = filterRowGroups(footer.getBlocks()); + this.blockIndexStores = listWithNulls(this.blocks.size()); + this.blockRowRanges = listWithNulls(this.blocks.size()); for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) { paths.put(ColumnPath.get(col.getPath()), col); } } + private static List listWithNulls(int size) { + return Stream.generate(() -> (T) null).limit(size).collect(Collectors.toCollection(ArrayList::new)); + } + public ParquetMetadata getFooter() { if (footer == null) { try { @@ -732,6 +756,17 @@ public long getRecordCount() { return total; } + long getFilteredRecordCount() { + if (!options.useColumnIndexFilter()) { + return getRecordCount(); + } + long total = 0; + for (int i = 0, n = blocks.size(); i < n; ++i) { + total += getRowRanges(i).rowCount(); + } + return total; + } + /** * @return the path for this file * @deprecated will be removed in 2.0.0; use {@link #getFile()} instead @@ -794,30 +829,111 
@@ public PageReadStore readNextRowGroup() throws IOException { throw new RuntimeException("Illegal row group of 0 rows"); } this.currentRowGroup = new ColumnChunkPageReadStore(block.getRowCount()); - // prepare the list of consecutive chunks to read them in one scan - List allChunks = new ArrayList(); - ConsecutiveChunkList currentChunks = null; + // prepare the list of consecutive parts to read them in one scan + List allParts = new ArrayList(); + ConsecutivePartList currentParts = null; for (ColumnChunkMetaData mc : block.getColumns()) { ColumnPath pathKey = mc.getPath(); BenchmarkCounter.incrementTotalBytes(mc.getTotalSize()); ColumnDescriptor columnDescriptor = paths.get(pathKey); if (columnDescriptor != null) { long startingPos = mc.getStartingPos(); - // first chunk or not consecutive => new list - if (currentChunks == null || currentChunks.endPos() != startingPos) { - currentChunks = new ConsecutiveChunkList(startingPos); - allChunks.add(currentChunks); + // first part or not consecutive => new list + if (currentParts == null || currentParts.endPos() != startingPos) { + currentParts = new ConsecutivePartList(startingPos); + allParts.add(currentParts); } - currentChunks.addChunk(new ChunkDescriptor(columnDescriptor, mc, startingPos, (int)mc.getTotalSize())); + currentParts.addChunk(new ChunkDescriptor(columnDescriptor, mc, startingPos, (int)mc.getTotalSize())); } } // actually read all the chunks - for (ConsecutiveChunkList consecutiveChunks : allChunks) { - final List chunks = consecutiveChunks.readAll(f); - for (Chunk chunk : chunks) { - currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages()); + ChunkListBuilder builder = new ChunkListBuilder(); + for (ConsecutivePartList consecutiveChunks : allParts) { + consecutiveChunks.readAll(f, builder); + } + for (Chunk chunk : builder.build()) { + currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages()); + } + + // avoid re-reading bytes the dictionary reader is used after this call + if 
(nextDictionaryReader != null) { + nextDictionaryReader.setRowGroup(currentRowGroup); + } + + advanceToNextBlock(); + + return currentRowGroup; + } + + /** + * Reads all the columns requested from the row group at the current file position. It may skip specific pages based + * on the column indexes according to the actual filter. As the rows are not aligned among the pages of the different + * columns row synchronization might be required. + * + * @return the PageReadStore which can provide PageReaders for each column + * @throws IOException + * if any I/O error occurs while reading + * @see {@link PageReadStore#isInPageFilteringMode()} + */ + public PageReadStore readNextFilteredRowGroup() throws IOException { + if (currentBlock == blocks.size()) { + return null; + } + if (!options.useColumnIndexFilter()) { + return readNextRowGroup(); + } + BlockMetaData block = blocks.get(currentBlock); + if (block.getRowCount() == 0) { + throw new RuntimeException("Illegal row group of 0 rows"); + } + ColumnIndexStore ciStore = getColumnIndexStore(currentBlock); + RowRanges rowRanges = getRowRanges(currentBlock); + long rowCount = rowRanges.rowCount(); + if (rowCount == 0) { + // There are no matching rows -> skipping this row-group + advanceToNextBlock(); + return readNextFilteredRowGroup(); + } + if (rowCount == block.getRowCount()) { + // All rows are matching -> fall back to the non-filtering path + return readNextRowGroup(); + } + + this.currentRowGroup = new ColumnChunkPageReadStore(rowRanges); + // prepare the list of consecutive parts to read them in one scan + ChunkListBuilder builder = new ChunkListBuilder(); + List allParts = new ArrayList(); + ConsecutivePartList currentParts = null; + for (ColumnChunkMetaData mc : block.getColumns()) { + ColumnPath pathKey = mc.getPath(); + ColumnDescriptor columnDescriptor = paths.get(pathKey); + if (columnDescriptor != null) { + OffsetIndex offsetIndex = ciStore.getOffsetIndex(mc.getPath()); + + OffsetIndex filteredOffsetIndex = 
filterOffsetIndex(offsetIndex, rowRanges, + block.getRowCount()); + for (OffsetRange range : calculateOffsetRanges(filteredOffsetIndex, mc, offsetIndex.getOffset(0))) { + BenchmarkCounter.incrementTotalBytes(range.getLength()); + long startingPos = range.getOffset(); + // first part or not consecutive => new list + if (currentParts == null || currentParts.endPos() != startingPos) { + currentParts = new ConsecutivePartList(startingPos); + allParts.add(currentParts); + } + ChunkDescriptor chunkDescriptor = new ChunkDescriptor(columnDescriptor, mc, startingPos, + (int) range.getLength()); + currentParts.addChunk(chunkDescriptor); + builder.setOffsetIndex(chunkDescriptor, filteredOffsetIndex); + } } } + // actually read all the chunks + for (ConsecutivePartList consecutiveChunks : allParts) { + consecutiveChunks.readAll(f, builder); + } + for (Chunk chunk : builder.build()) { + currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages()); + } // avoid re-reading bytes the dictionary reader is used after this call if (nextDictionaryReader != null) { @@ -829,6 +945,25 @@ public PageReadStore readNextRowGroup() throws IOException { return currentRowGroup; } + private ColumnIndexStore getColumnIndexStore(int blockIndex) { + ColumnIndexStore ciStore = blockIndexStores.get(blockIndex); + if (ciStore == null) { + ciStore = ColumnIndexStoreImpl.create(this, blocks.get(blockIndex), paths.keySet()); + blockIndexStores.set(blockIndex, ciStore); + } + return ciStore; + } + + private RowRanges getRowRanges(int blockIndex) { + RowRanges rowRanges = blockRowRanges.get(blockIndex); + if (rowRanges == null) { + rowRanges = ColumnIndexFilter.calculateRowRanges(options.getRecordFilter(), getColumnIndexStore(blockIndex), + paths.keySet(), blocks.get(blockIndex).getRowCount()); + blockRowRanges.set(blockIndex, rowRanges); + } + return rowRanges; + } + public boolean skipNextRowGroup() { return advanceToNextBlock(); } @@ -863,9 +998,6 @@ public DictionaryPageReader 
getDictionaryReader(BlockMetaData block) { return new DictionaryPageReader(this, block); } - public BloomFilterDataReader getBloomFilterDataReader(BlockMetaData block) { - return new BloomFilterDataReader(this, block); - } /** * Reads and decompresses a dictionary page for the given column chunk. * @@ -917,6 +1049,10 @@ private DictionaryPage readCompressedDictionary( converter.getEncoding(dictHeader.getEncoding())); } + public BloomFilterDataReader getBloomFilterDataReader(BlockMetaData block) { + return new BloomFilterDataReader(this, block); + } + /** * Reads Bloom filter data for the given column chunk. * @@ -926,26 +1062,55 @@ private DictionaryPage readCompressedDictionary( */ public BloomFilter readBloomFilter(ColumnChunkMetaData meta) throws IOException { long bloomFilterOffset = meta.getBloomFilterOffset(); - if (bloomFilterOffset == Long.MAX_VALUE) return null; f.seek(bloomFilterOffset); - // Read Bloom filter data header. byte[] bytes = new byte[BlockSplitBloomFilter.HEADER_SIZE]; f.read(bytes); ByteBuffer bloomHeader = ByteBuffer.wrap(bytes); IntBuffer headerBuffer = bloomHeader.order(ByteOrder.LITTLE_ENDIAN).asIntBuffer(); int numBytes = headerBuffer.get(); - BloomFilter.HashStrategy hash = BloomFilter.HashStrategy.values()[headerBuffer.get()]; BloomFilter.Algorithm algorithm = BloomFilter.Algorithm.values()[headerBuffer.get()]; - byte[] bitset = new byte[numBytes]; f.readFully(bitset); - return new BlockSplitBloomFilter(bitset); } + /** + * @param column + * the column chunk which the column index is to be returned for + * @return the column index for the specified column chunk or {@code null} if there is no index + * @throws IOException + * if any I/O error occurs during reading the file + */ + @Private + public ColumnIndex readColumnIndex(ColumnChunkMetaData column) throws IOException { + IndexReference ref = column.getColumnIndexReference(); + if (ref == null) { + return null; + } + f.seek(ref.getOffset()); + return 
ParquetMetadataConverter.fromParquetColumnIndex(column.getPrimitiveType(), Util.readColumnIndex(f)); + } + + /** + * @param column + * the column chunk which the offset index is to be returned for + * @return the offset index for the specified column chunk or {@code null} if there is no index + * @throws IOException + * if any I/O error occurs during reading the file + */ + @Private + public OffsetIndex readOffsetIndex(ColumnChunkMetaData column) throws IOException { + IndexReference ref = column.getOffsetIndexReference(); + if (ref == null) { + return null; + } + f.seek(ref.getOffset()); + return ParquetMetadataConverter.fromParquetOffsetIndex(Util.readOffsetIndex(f)); + } + @Override public void close() throws IOException { try { @@ -957,6 +1122,57 @@ public void close() throws IOException { } } + /* + * Builder to concatenate the buffers of the discontinuous parts for the same column. These parts are generated as a + * result of the column-index based filtering when some pages might be skipped at reading. 
+ */ + private class ChunkListBuilder { + private class ChunkData { + final List buffers = new ArrayList<>(); + OffsetIndex offsetIndex; + } + + private final Map map = new HashMap<>(); + private ChunkDescriptor lastDescriptor; + private SeekableInputStream f; + + void add(ChunkDescriptor descriptor, List buffers, SeekableInputStream f) { + ChunkData data = map.get(descriptor); + if (data == null) { + data = new ChunkData(); + map.put(descriptor, data); + } + data.buffers.addAll(buffers); + + lastDescriptor = descriptor; + this.f = f; + } + + void setOffsetIndex(ChunkDescriptor descriptor, OffsetIndex offsetIndex) { + ChunkData data = map.get(descriptor); + if (data == null) { + data = new ChunkData(); + map.put(descriptor, data); + } + data.offsetIndex = offsetIndex; + } + + List build() { + List chunks = new ArrayList<>(); + for (Entry entry : map.entrySet()) { + ChunkDescriptor descriptor = entry.getKey(); + ChunkData data = entry.getValue(); + if (descriptor.equals(lastDescriptor)) { + // because of a bug, the last chunk might be larger than descriptor.size + chunks.add(new WorkaroundChunk(lastDescriptor, data.buffers, f, data.offsetIndex)); + } else { + chunks.add(new Chunk(descriptor, data.buffers, data.offsetIndex)); + } + } + return chunks; + } + } + /** * The data for a column chunk */ @@ -964,15 +1180,17 @@ private class Chunk { protected final ChunkDescriptor descriptor; protected final ByteBufferInputStream stream; + final OffsetIndex offsetIndex; /** - * * @param descriptor descriptor for the chunk * @param buffers ByteBuffers that contain the chunk + * @param offsetIndex the offset index for this column; might be null */ - public Chunk(ChunkDescriptor descriptor, List buffers) { + public Chunk(ChunkDescriptor descriptor, List buffers, OffsetIndex offsetIndex) { this.descriptor = descriptor; this.stream = ByteBufferInputStream.wrap(buffers); + this.offsetIndex = offsetIndex; } protected PageHeader readPageHeader() throws IOException { @@ -989,7 +1207,8 
@@ public ColumnChunkPageReader readAllPages() throws IOException { PrimitiveType type = getFileMetaData().getSchema() .getType(descriptor.col.getPath()).asPrimitiveType(); long valuesCountReadSoFar = 0; - while (valuesCountReadSoFar < descriptor.metadata.getValueCount()) { + int dataPageCountReadSoFar = 0; + while (hasMorePages(valuesCountReadSoFar, dataPageCountReadSoFar)) { PageHeader pageHeader = readPageHeader(); int uncompressedPageSize = pageHeader.getUncompressed_page_size(); int compressedPageSize = pageHeader.getCompressed_page_size(); @@ -999,8 +1218,8 @@ public ColumnChunkPageReader readAllPages() throws IOException { if (dictionaryPage != null) { throw new ParquetDecodingException("more than one dictionary page in column " + descriptor.col); } - DictionaryPageHeader dicHeader = pageHeader.getDictionary_page_header(); - dictionaryPage = + DictionaryPageHeader dicHeader = pageHeader.getDictionary_page_header(); + dictionaryPage = new DictionaryPage( this.readAsBytesInput(compressedPageSize), uncompressedPageSize, @@ -1024,6 +1243,7 @@ public ColumnChunkPageReader readAllPages() throws IOException { converter.getEncoding(dataHeaderV1.getEncoding()) )); valuesCountReadSoFar += dataHeaderV1.getNum_values(); + ++dataPageCountReadSoFar; break; case DATA_PAGE_V2: DataPageHeaderV2 dataHeaderV2 = pageHeader.getData_page_header_v2(); @@ -1045,6 +1265,7 @@ public ColumnChunkPageReader readAllPages() throws IOException { dataHeaderV2.isIs_compressed() )); valuesCountReadSoFar += dataHeaderV2.getNum_values(); + ++dataPageCountReadSoFar; break; default: LOG.debug("skipping page of type {} of size {}", pageHeader.getType(), compressedPageSize); @@ -1052,7 +1273,7 @@ public ColumnChunkPageReader readAllPages() throws IOException { break; } } - if (valuesCountReadSoFar != descriptor.metadata.getValueCount()) { + if (offsetIndex == null && valuesCountReadSoFar != descriptor.metadata.getValueCount()) { // Would be nice to have a CorruptParquetFileException or something as 
a subclass? throw new IOException( "Expected " + descriptor.metadata.getValueCount() + " values in column chunk at " + @@ -1061,7 +1282,13 @@ public ColumnChunkPageReader readAllPages() throws IOException { + " pages ending at file offset " + (descriptor.fileOffset + stream.position())); } BytesInputDecompressor decompressor = options.getCodecFactory().getDecompressor(descriptor.metadata.getCodec()); - return new ColumnChunkPageReader(decompressor, pagesInChunk, dictionaryPage); + return new ColumnChunkPageReader(decompressor, pagesInChunk, dictionaryPage, offsetIndex, + blocks.get(currentBlock).getRowCount()); + } + + private boolean hasMorePages(long valuesCountReadSoFar, int dataPageCountReadSoFar) { + return offsetIndex == null ? valuesCountReadSoFar < descriptor.metadata.getValueCount() + : dataPageCountReadSoFar < offsetIndex.getPageCount(); } /** @@ -1086,8 +1313,8 @@ private class WorkaroundChunk extends Chunk { * @param descriptor the descriptor of the chunk * @param f the file stream positioned at the end of this chunk */ - private WorkaroundChunk(ChunkDescriptor descriptor, List buffers, SeekableInputStream f) { - super(descriptor, buffers); + private WorkaroundChunk(ChunkDescriptor descriptor, List buffers, SeekableInputStream f, OffsetIndex offsetIndex) { + super(descriptor, buffers, offsetIndex); this.f = f; } @@ -1136,7 +1363,7 @@ public BytesInput readAsBytesInput(int size) throws IOException { /** - * information needed to read a column chunk + * Information needed to read a column chunk or a part of it. 
*/ private static class ChunkDescriptor { @@ -1162,12 +1389,29 @@ private ChunkDescriptor( this.fileOffset = fileOffset; this.size = size; } + + @Override + public int hashCode() { + return col.hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } else if (obj instanceof ChunkDescriptor) { + return col.equals(((ChunkDescriptor) obj).col); + } else { + return false; + } + } } /** - * describes a list of consecutive column chunks to be read at once. + * Describes a list of consecutive parts to be read at once. A consecutive part may contain whole column chunks or + * only parts of them (some pages). */ - private class ConsecutiveChunkList { + private class ConsecutivePartList { private final long offset; private int length; @@ -1176,7 +1420,7 @@ private class ConsecutiveChunkList { /** * @param offset where the first chunk starts */ - ConsecutiveChunkList(long offset) { + ConsecutivePartList(long offset) { this.offset = offset; } @@ -1192,45 +1436,19 @@ public void addChunk(ChunkDescriptor descriptor) { /** * @param f file to read the chunks from - * @return the chunks + * @param builder used to build chunk list to read the pages for the different columns * @throws IOException if there is an error while reading from the stream */ - public List readAll(SeekableInputStream f) throws IOException { - List result = new ArrayList(chunks.size()); - f.seek(offset); - - int fullAllocations = length / options.getMaxAllocationSize(); - int lastAllocationSize = length % options.getMaxAllocationSize(); - - int numAllocations = fullAllocations + (lastAllocationSize > 0 ? 
1 : 0); - List buffers = new ArrayList<>(numAllocations); - - for (int i = 0; i < fullAllocations; i += 1) { - buffers.add(options.getAllocator().allocate(options.getMaxAllocationSize())); - } - - if (lastAllocationSize > 0) { - buffers.add(options.getAllocator().allocate(lastAllocationSize)); - } - - for (ByteBuffer buffer : buffers) { - f.readFully(buffer); - buffer.flip(); - } + public void readAll(SeekableInputStream f, ChunkListBuilder builder) throws IOException { + List buffers = readBlocks(f, offset, length); // report in a counter the data we just scanned BenchmarkCounter.incrementBytesRead(length); ByteBufferInputStream stream = ByteBufferInputStream.wrap(buffers); for (int i = 0; i < chunks.size(); i++) { ChunkDescriptor descriptor = chunks.get(i); - if (i < chunks.size() - 1) { - result.add(new Chunk(descriptor, stream.sliceBuffers(descriptor.size))); - } else { - // because of a bug, the last chunk might be larger than descriptor.size - result.add(new WorkaroundChunk(descriptor, stream.sliceBuffers(descriptor.size), f)); - } + builder.add(descriptor, stream.sliceBuffers(descriptor.size), f); } - return result ; } /** @@ -1242,4 +1460,72 @@ public long endPos() { } + /** + * @param f file to read the blocks from + * @return the ByteBuffer blocks + * @throws IOException if there is an error while reading from the stream + */ + List readBlocks(SeekableInputStream f, long offset, int length) throws IOException { + f.seek(offset); + + int fullAllocations = length / options.getMaxAllocationSize(); + int lastAllocationSize = length % options.getMaxAllocationSize(); + + int numAllocations = fullAllocations + (lastAllocationSize > 0 ? 
1 : 0); + List buffers = new ArrayList<>(numAllocations); + + for (int i = 0; i < fullAllocations; i++) { + buffers.add(options.getAllocator().allocate(options.getMaxAllocationSize())); + } + + if (lastAllocationSize > 0) { + buffers.add(options.getAllocator().allocate(lastAllocationSize)); + } + + for (ByteBuffer buffer : buffers) { + f.readFully(buffer); + buffer.flip(); + } + return buffers; + } + + Optional readColumnInBlock(int blockIndex, ColumnDescriptor columnDescriptor) { + BlockMetaData block = blocks.get(blockIndex); + if (block.getRowCount() == 0) { + throw new RuntimeException("Illegal row group of 0 rows"); + } + Optional mc = findColumnByPath(block, columnDescriptor.getPath()); + + return mc.map(column -> new ChunkDescriptor(columnDescriptor, column, column.getStartingPos(), (int) column.getTotalSize())) + .map(chunk -> readChunk(f, chunk)); + } + + private ColumnChunkPageReader readChunk(SeekableInputStream f, ChunkDescriptor descriptor) { + try { + List buffers = readBlocks(f, descriptor.fileOffset, descriptor.size); + ByteBufferInputStream stream = ByteBufferInputStream.wrap(buffers); + Chunk chunk = new WorkaroundChunk(descriptor, stream.sliceBuffers(descriptor.size), f, null); + return chunk.readAllPages(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private Optional findColumnByPath(BlockMetaData block, String[] path) { + for (ColumnChunkMetaData column : block.getColumns()) { + if (Arrays.equals(column.getPath().toArray(), path)) { + return Optional.of(column); + } + } + return Optional.empty(); + } + + public int blocksCount() { + return blocks.size(); + } + + public BlockMetaData getBlockMetaData(int blockIndex) { + return blocks.get(blockIndex); + } + } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java index 7c52b1b93f..0a13e543e4 100644 --- 
a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -19,18 +19,22 @@ package org.apache.parquet.hadoop; import static org.apache.parquet.format.Util.writeFileMetaData; +import static org.apache.parquet.format.converter.ParquetMetadataConverter.MAX_STATS_SIZE; import static org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE; import static org.apache.parquet.hadoop.ParquetWriter.MAX_PADDING_SIZE_DEFAULT; import java.io.IOException; import java.nio.charset.Charset; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.Optional; import java.util.Set; import org.apache.hadoop.conf.Configuration; @@ -41,16 +45,27 @@ import org.apache.parquet.Preconditions; import org.apache.parquet.Strings; import org.apache.parquet.Version; +import org.apache.parquet.bytes.ByteBufferAllocator; import org.apache.parquet.bytes.BytesInput; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.bytes.HeapByteBufferAllocator; import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.ColumnReader; +import 
org.apache.parquet.column.ColumnWriteStore; +import org.apache.parquet.column.ColumnWriter; import org.apache.parquet.column.Encoding; import org.apache.parquet.column.EncodingStats; +import org.apache.parquet.column.ParquetProperties; +import org.apache.parquet.column.impl.ColumnReadStoreImpl; +import org.apache.parquet.column.impl.ColumnWriteStoreV1; import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.page.PageReader; import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.column.values.bloomfilter.BloomFilter; +import org.apache.parquet.example.DummyRecordConverter; import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel; import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.format.Util; import org.apache.parquet.format.converter.ParquetMetadataConverter; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; @@ -58,8 +73,14 @@ import org.apache.parquet.hadoop.metadata.FileMetaData; import org.apache.parquet.hadoop.metadata.GlobalMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.hadoop.util.BlocksCombiner; import org.apache.parquet.hadoop.util.HadoopOutputFile; import org.apache.parquet.hadoop.util.HadoopStreams; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder; +import org.apache.parquet.internal.hadoop.metadata.IndexReference; import org.apache.parquet.io.InputFile; import org.apache.parquet.io.OutputFile; import org.apache.parquet.io.SeekableInputStream; @@ -94,13 +115,22 @@ public static enum Mode { private final MessageType schema; private final PositionOutputStream out; private final AlignmentStrategy alignment; + 
private final int columnIndexTruncateLength; // file data private List blocks = new ArrayList(); + // The column/offset indexes per blocks per column chunks + private final List> columnIndexes = new ArrayList<>(); + private final List> offsetIndexes = new ArrayList<>(); + // row group data private BlockMetaData currentBlock; // appended to by endColumn + // The column/offset indexes for the actual block + private List currentColumnIndexes; + private List currentOffsetIndexes; + // row group data set at the start of a row group private long currentRecordCount; // set in startBlock @@ -110,6 +140,9 @@ public static enum Mode { private long uncompressedLength; private long compressedLength; private Statistics currentStatistics; // accumulated in writePage(s) + private ColumnIndexBuilder columnIndexBuilder; + private OffsetIndexBuilder offsetIndexBuilder; + private long firstPageOffset; // column chunk data set at the start of a column private CompressionCodecName currentChunkCodec; // set in startColumn @@ -228,10 +261,27 @@ public ParquetFileWriter(Configuration configuration, MessageType schema, * @param rowGroupSize the row group size * @param maxPaddingSize the maximum padding * @throws IOException if the file can not be created + * @deprecated will be removed in 2.0.0 */ + @Deprecated public ParquetFileWriter(OutputFile file, MessageType schema, Mode mode, long rowGroupSize, int maxPaddingSize) throws IOException { + this(file, schema, mode, rowGroupSize, maxPaddingSize, + ParquetProperties.DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH); + } + /** + * @param file OutputFile to create or overwrite + * @param schema the schema of the data + * @param mode file creation mode + * @param rowGroupSize the row group size + * @param maxPaddingSize the maximum padding + * @param columnIndexTruncateLength the length which the min/max values in column indexes tried to be truncated to + * @throws IOException if the file can not be created + */ + public ParquetFileWriter(OutputFile 
file, MessageType schema, Mode mode, + long rowGroupSize, int maxPaddingSize, int columnIndexTruncateLength) + throws IOException { TypeUtil.checkValidWriteSchema(schema); this.schema = schema; @@ -251,6 +301,7 @@ public ParquetFileWriter(OutputFile file, MessageType schema, Mode mode, } this.encodingStatsBuilder = new EncodingStats.Builder(); + this.columnIndexTruncateLength = columnIndexTruncateLength; } /** @@ -273,6 +324,8 @@ public ParquetFileWriter(OutputFile file, MessageType schema, Mode mode, this.out = HadoopStreams.wrap( fs.create(file, true, 8192, fs.getDefaultReplication(file), rowAndBlockSize)); this.encodingStatsBuilder = new EncodingStats.Builder(); + // no truncation is needed for testing + this.columnIndexTruncateLength = Integer.MAX_VALUE; } /** * start the file @@ -298,6 +351,9 @@ public void startBlock(long recordCount) throws IOException { currentBlock = new BlockMetaData(); currentRecordCount = recordCount; + + currentColumnIndexes = new ArrayList<>(); + currentOffsetIndexes = new ArrayList<>(); } /** @@ -322,6 +378,10 @@ public void startColumn(ColumnDescriptor descriptor, uncompressedLength = 0; // The statistics will be copied from the first one added at writeDataPage(s) so we have the correct typed one currentStatistics = null; + + columnIndexBuilder = ColumnIndexBuilder.getBuilder(currentChunkType, columnIndexTruncateLength); + offsetIndexBuilder = OffsetIndexBuilder.getBuilder(); + firstPageOffset = -1; } /** @@ -379,6 +439,9 @@ public void writeDataPage( Encoding dlEncoding, Encoding valuesEncoding) throws IOException { state = state.write(); + // We are unable to build indexes without rowCount so skip them for this column + offsetIndexBuilder = OffsetIndexBuilder.getNoOpBuilder(); + columnIndexBuilder = ColumnIndexBuilder.getNoOpBuilder(); long beforeHeader = out.getPos(); LOG.debug("{}: write data page: {} values", beforeHeader, valueCount); int compressedPageSize = (int)bytes.size(); @@ -410,8 +473,50 @@ public void writeDataPage( * 
@param dlEncoding encoding of the definition level * @param valuesEncoding encoding of values * @throws IOException if there is an error while writing + * @deprecated this method does not support writing column indexes; Use + * {@link #writeDataPage(int, int, BytesInput, Statistics, long, Encoding, Encoding, Encoding)} instead + */ + @Deprecated + public void writeDataPage( + int valueCount, int uncompressedPageSize, + BytesInput bytes, + Statistics statistics, + Encoding rlEncoding, + Encoding dlEncoding, + Encoding valuesEncoding) throws IOException { + // We are unable to build indexes without rowCount so skip them for this column + offsetIndexBuilder = OffsetIndexBuilder.getNoOpBuilder(); + columnIndexBuilder = ColumnIndexBuilder.getNoOpBuilder(); + innerWriteDataPage(valueCount, uncompressedPageSize, bytes, statistics, rlEncoding, dlEncoding, valuesEncoding); + } + + /** + * Writes a single page + * @param valueCount count of values + * @param uncompressedPageSize the size of the data once uncompressed + * @param bytes the compressed data for the page without header + * @param statistics the statistics of the page + * @param rowCount the number of rows in the page + * @param rlEncoding encoding of the repetition level + * @param dlEncoding encoding of the definition level + * @param valuesEncoding encoding of values + * @throws IOException if any I/O error occurs during writing the file */ public void writeDataPage( + int valueCount, int uncompressedPageSize, + BytesInput bytes, + Statistics statistics, + long rowCount, + Encoding rlEncoding, + Encoding dlEncoding, + Encoding valuesEncoding) throws IOException { + long beforeHeader = out.getPos(); + innerWriteDataPage(valueCount, uncompressedPageSize, bytes, statistics, rlEncoding, dlEncoding, valuesEncoding); + + offsetIndexBuilder.add((int) (out.getPos() - beforeHeader), rowCount); + } + + private void innerWriteDataPage( int valueCount, int uncompressedPageSize, BytesInput bytes, Statistics statistics, @@ 
-420,8 +525,11 @@ public void writeDataPage( Encoding valuesEncoding) throws IOException { state = state.write(); long beforeHeader = out.getPos(); + if (firstPageOffset == -1) { + firstPageOffset = beforeHeader; + } LOG.debug("{}: write data page: {} values", beforeHeader, valueCount); - int compressedPageSize = (int)bytes.size(); + int compressedPageSize = (int) bytes.size(); metadataConverter.writeDataPageHeader( uncompressedPageSize, compressedPageSize, valueCount, @@ -443,6 +551,8 @@ public void writeDataPage( currentStatistics.mergeStatistics(statistics); } + columnIndexBuilder.add(statistics); + encodingStatsBuilder.addDataEncoding(valuesEncoding); currentEncodings.add(rlEncoding); currentEncodings.add(dlEncoding); @@ -450,25 +560,47 @@ public void writeDataPage( } /** - * writes a number of pages at once - * @param bytes bytes to be written including page headers + * Writes a column chunk at once + * @param descriptor the descriptor of the column + * @param valueCount the value count in this column + * @param compressionCodecName the name of the compression codec used for compressing the pages + * @param dictionaryPage the dictionary page for this column chunk (might be null) + * @param bytes the encoded pages including page headers to be written as is * @param uncompressedTotalPageSize total uncompressed size (without page headers) * @param compressedTotalPageSize total compressed size (without page headers) + * @param totalStats accumulated statistics for the column chunk + * @param columnIndexBuilder the builder object for the column index + * @param offsetIndexBuilder the builder object for the offset index + * @param rlEncodings the RL encodings used in this column chunk + * @param dlEncodings the DL encodings used in this column chunk + * @param dataEncodings the data encodings used in this column chunk * @throws IOException if there is an error while writing */ - void writeDataPages(BytesInput bytes, - long uncompressedTotalPageSize, - long 
compressedTotalPageSize, - Statistics totalStats, - Set rlEncodings, - Set dlEncodings, - List dataEncodings) throws IOException { + void writeColumnChunk(ColumnDescriptor descriptor, + long valueCount, + CompressionCodecName compressionCodecName, + DictionaryPage dictionaryPage, + BytesInput bytes, + long uncompressedTotalPageSize, + long compressedTotalPageSize, + Statistics totalStats, + ColumnIndexBuilder columnIndexBuilder, + OffsetIndexBuilder offsetIndexBuilder, + Set rlEncodings, + Set dlEncodings, + List dataEncodings) throws IOException { + startColumn(descriptor, valueCount, compressionCodecName); + state = state.write(); + if (dictionaryPage != null) { + writeDictionaryPage(dictionaryPage); + } LOG.debug("{}: write data pages", out.getPos()); long headersSize = bytes.size() - compressedTotalPageSize; this.uncompressedLength += uncompressedTotalPageSize + headersSize; this.compressedLength += compressedTotalPageSize + headersSize; LOG.debug("{}: write data pages content", out.getPos()); + firstPageOffset = out.getPos(); bytes.writeAllTo(out); encodingStatsBuilder.addDataEncodings(dataEncodings); if (rlEncodings.isEmpty()) { @@ -478,6 +610,11 @@ void writeDataPages(BytesInput bytes, currentEncodings.addAll(dlEncodings); currentEncodings.addAll(dataEncodings); currentStatistics = totalStats; + + this.columnIndexBuilder = columnIndexBuilder; + this.offsetIndexBuilder = offsetIndexBuilder; + + endColumn(); } /** @@ -487,6 +624,12 @@ void writeDataPages(BytesInput bytes, public void endColumn() throws IOException { state = state.endColumn(); LOG.debug("{}: end column", out.getPos()); + if (columnIndexBuilder.getMinMaxSize() > columnIndexBuilder.getPageCount() * MAX_STATS_SIZE) { + currentColumnIndexes.add(null); + } else { + currentColumnIndexes.add(columnIndexBuilder.build()); + } + currentOffsetIndexes.add(offsetIndexBuilder.build(firstPageOffset)); currentBlock.addColumn(ColumnChunkMetaData.get( currentChunkPath, currentChunkType, @@ -503,6 +646,8 @@ 
public void endColumn() throws IOException { this.currentBlock.setTotalByteSize(currentBlock.getTotalByteSize() + uncompressedLength); this.uncompressedLength = 0; this.compressedLength = 0; + columnIndexBuilder = null; + offsetIndexBuilder = null; } /** @@ -514,6 +659,10 @@ public void endBlock() throws IOException { LOG.debug("{}: end block", out.getPos()); currentBlock.setRowCount(currentRecordCount); blocks.add(currentBlock); + columnIndexes.add(currentColumnIndexes); + offsetIndexes.add(currentOffsetIndexes); + currentColumnIndexes = null; + currentOffsetIndexes = null; currentBlock = null; } @@ -532,6 +681,116 @@ public void appendFile(InputFile file) throws IOException { ParquetFileReader.open(file).appendTo(this); } + public int merge(List inputFiles, CodecFactory.BytesCompressor compressor, String createdBy, long maxBlockSize) throws IOException { + List readers = getReaders(inputFiles); + try { + ByteBufferAllocator allocator = new HeapByteBufferAllocator(); + ColumnReadStoreImpl columnReadStore = new ColumnReadStoreImpl(null, new DummyRecordConverter(schema).getRootConverter(), schema, createdBy); + this.start(); + List largeBlocks = BlocksCombiner.combineLargeBlocks(readers, maxBlockSize); + for (BlocksCombiner.SmallBlocksUnion smallBlocks : largeBlocks) { + for (int columnIndex = 0; columnIndex < schema.getColumns().size(); columnIndex++) { + ColumnDescriptor path = schema.getColumns().get(columnIndex); + ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor, schema, allocator, ParquetProperties.DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH); + ColumnWriteStoreV1 columnWriteStoreV1 = new ColumnWriteStoreV1(schema, store, ParquetProperties.builder().build()); + for (BlocksCombiner.SmallBlock smallBlock : smallBlocks.getBlocks()) { + ParquetFileReader parquetFileReader = smallBlock.getReader(); + try { + Optional columnChunkPageReader = parquetFileReader.readColumnInBlock(smallBlock.getBlockIndex(), path); + ColumnWriter columnWriter = 
columnWriteStoreV1.getColumnWriter(path); + if (columnChunkPageReader.isPresent()) { + ColumnReader columnReader = columnReadStore.newMemColumnReader(path, columnChunkPageReader.get()); + for (int i = 0; i < columnReader.getTotalValueCount(); i++) { + consumeTriplet(columnWriteStoreV1, columnWriter, columnReader); + } + } else { + MessageType inputFileSchema = parquetFileReader.getFileMetaData().getSchema(); + String[] parentPath = getExisingParentPath(path, inputFileSchema); + int def = parquetFileReader.getFileMetaData().getSchema().getMaxDefinitionLevel(parentPath); + int rep = parquetFileReader.getFileMetaData().getSchema().getMaxRepetitionLevel(parentPath); + for (int i = 0; i < parquetFileReader.getBlockMetaData(smallBlock.getBlockIndex()).getRowCount(); i++) { + columnWriter.writeNull(rep, def); + if (def == 0) { + // V1 pages also respect record boundaries so we have to mark them + columnWriteStoreV1.endRecord(); + } + } + } + } catch (Exception e) { + LOG.error("File {} is not readable", parquetFileReader.getFile(), e); + } + } + if (columnIndex == 0) { + this.startBlock(smallBlocks.getRowCount()); + } + columnWriteStoreV1.flush(); + store.flushToFileWriter(path, this); + } + this.endBlock(); + } + this.end(Collections.emptyMap()); + }finally { + BlocksCombiner.closeReaders(readers); + } + return 0; + } + + private String[] getExisingParentPath(ColumnDescriptor path, MessageType inputFileSchema) { + List parentPath = Arrays.asList(path.getPath()); + while (parentPath.size() > 0 && !inputFileSchema.containsPath(parentPath.toArray(new String[parentPath.size()]))) { + parentPath = parentPath.subList(0, parentPath.size() - 1); + } + return parentPath.toArray(new String[parentPath.size()]); + } + + private List getReaders(List inputFiles) throws IOException { + List readers = new ArrayList<>(inputFiles.size()); + for (InputFile inputFile : inputFiles) { + readers.add(ParquetFileReader.open(inputFile)); + } + return readers; + } + + private void 
consumeTriplet(ColumnWriteStore columnWriteStore, ColumnWriter columnWriter, ColumnReader columnReader) { + int definitionLevel = columnReader.getCurrentDefinitionLevel(); + int repetitionLevel = columnReader.getCurrentRepetitionLevel(); + ColumnDescriptor column = columnReader.getDescriptor(); + PrimitiveType type = column.getPrimitiveType(); + if (definitionLevel < column.getMaxDefinitionLevel()) { + columnWriter.writeNull(repetitionLevel, definitionLevel); + } else { + switch (type.getPrimitiveTypeName()) { + case INT32: + columnWriter.write(columnReader.getInteger(), repetitionLevel, definitionLevel); + break; + case INT64: + columnWriter.write(columnReader.getLong(), repetitionLevel, definitionLevel); + break; + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + case INT96: + columnWriter.write(columnReader.getBinary(), repetitionLevel, definitionLevel); + break; + case BOOLEAN: + columnWriter.write(columnReader.getBoolean(), repetitionLevel, definitionLevel); + break; + case FLOAT: + columnWriter.write(columnReader.getFloat(), repetitionLevel, definitionLevel); + break; + case DOUBLE: + columnWriter.write(columnReader.getDouble(), repetitionLevel, definitionLevel); + break; + default: + throw new IllegalArgumentException("Unknown primitive type " + type); + } + } + columnReader.consume(); + if (repetitionLevel == 0) { + // V1 pages also respect record boundaries so we have to mark them + columnWriteStore.endRecord(); + } + } + /** * @param file a file stream to read from * @param rowGroups row groups to copy @@ -626,6 +885,11 @@ public void appendRowGroup(SeekableInputStream from, BlockMetaData rowGroup, length = 0; } + // TODO: column/offset indexes are not copied + // (it would require seeking to the end of the file for each row groups) + currentColumnIndexes.add(null); + currentOffsetIndexes.add(null); + currentBlock.addColumn(ColumnChunkMetaData.get( chunk.getPath(), chunk.getPrimitiveType(), @@ -693,12 +957,57 @@ private static void copy(SeekableInputStream 
from, PositionOutputStream to, */ public void end(Map extraMetaData) throws IOException { state = state.end(); + serializeColumnIndexes(columnIndexes, blocks, out); + serializeOffsetIndexes(offsetIndexes, blocks, out); LOG.debug("{}: end", out.getPos()); this.footer = new ParquetMetadata(new FileMetaData(schema, extraMetaData, Version.FULL_VERSION), blocks); serializeFooter(footer, out); out.close(); } + private static void serializeColumnIndexes( + List> columnIndexes, + List blocks, + PositionOutputStream out) throws IOException { + LOG.debug("{}: column indexes", out.getPos()); + for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) { + List columns = blocks.get(bIndex).getColumns(); + List blockColumnIndexes = columnIndexes.get(bIndex); + for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) { + ColumnChunkMetaData column = columns.get(cIndex); + org.apache.parquet.format.ColumnIndex columnIndex = ParquetMetadataConverter + .toParquetColumnIndex(column.getPrimitiveType(), blockColumnIndexes.get(cIndex)); + if (columnIndex == null) { + continue; + } + long offset = out.getPos(); + Util.writeColumnIndex(columnIndex, out); + column.setColumnIndexReference(new IndexReference(offset, (int) (out.getPos() - offset))); + } + } + } + + private static void serializeOffsetIndexes( + List> offsetIndexes, + List blocks, + PositionOutputStream out) throws IOException { + LOG.debug("{}: offset indexes", out.getPos()); + for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) { + List columns = blocks.get(bIndex).getColumns(); + List blockOffsetIndexes = offsetIndexes.get(bIndex); + for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) { + OffsetIndex offsetIndex = blockOffsetIndexes.get(cIndex); + if (offsetIndex == null) { + continue; + } + ColumnChunkMetaData column = columns.get(cIndex); + long offset = out.getPos(); + Util.writeOffsetIndex(ParquetMetadataConverter.toParquetOffsetIndex(offsetIndex), out); + 
column.setOffsetIndexReference(new IndexReference(offset, (int) (out.getPos() - offset))); + } + } + } + private static void serializeFooter(ParquetMetadata footer, PositionOutputStream out) throws IOException { long footerIndex = out.getPos(); org.apache.parquet.format.FileMetaData parquetMetadata = metadataConverter.toParquetMetadata(CURRENT_VERSION, footer); diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetInputFormat.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetInputFormat.java index 3348ed8eb2..4d6f42c2b8 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetInputFormat.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetInputFormat.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -129,6 +129,11 @@ public class ParquetInputFormat extends FileInputFormat { */ public static final String DICTIONARY_FILTERING_ENABLED = "parquet.filter.dictionary.enabled"; + /** + * key to configure whether column index filtering of pages is enabled + */ + public static final String COLUMN_INDEX_FILTERING_ENABLED = "parquet.filter.columnindex.enabled"; + /** * key to configure whether row group bloom filtering is enabled */ diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java index ffcf5c6a32..0789bf50d4 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * http://www.apache.org/licenses/LICENSE-2.0 - * + * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -143,9 +143,7 @@ public static enum JobSummaryLevel { public static final String MIN_ROW_COUNT_FOR_PAGE_SIZE_CHECK = "parquet.page.size.row.check.min"; public static final String MAX_ROW_COUNT_FOR_PAGE_SIZE_CHECK = "parquet.page.size.row.check.max"; public static final String ESTIMATE_PAGE_SIZE_CHECK = "parquet.page.size.check.estimate"; - public static final String BLOOM_FILTER_COLUMN_NAMES = "parquet.bloom.filter.column.names"; - public static final String BLOOM_FILTER_EXPECT_DISTINCT_NUMBERS = "parquet.bloom.filter.expected.distinct.numbers"; - public static final String ENABLE_BLOOM_FILTER = "parquet.enable.bloom.filter"; + /** Configuration key for the truncate length of column indexes. */ public static final String COLUMN_INDEX_TRUNCATE_LENGTH = "parquet.columnindex.truncate.length"; public static JobSummaryLevel getJobSummaryLevel(Configuration conf) { String level = conf.get(JOB_SUMMARY_LEVEL); @@ -211,14 +209,6 @@ public static boolean getEnableDictionary(JobContext jobContext) { return getEnableDictionary(getConfiguration(jobContext)); } - public static void setBloomFilterColumnNames(Job job, String names) { - getConfiguration(job).set(BLOOM_FILTER_COLUMN_NAMES, names); - } - - public static String getBloomFilterColumnNames(JobContext jobContext) { - return getBloomFilterColumnNames(getConfiguration(jobContext)); - } - public static int getBlockSize(JobContext jobContext) { return getBlockSize(getConfiguration(jobContext)); } @@ -252,19 +242,6 @@ public static boolean getEnableDictionary(Configuration configuration) { ENABLE_DICTIONARY, ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED); } - public static String getBloomFilterColumnNames(Configuration conf) { - return conf.get(BLOOM_FILTER_COLUMN_NAMES); - } - - public static boolean 
getEnableBloomFilter(Configuration configuration) { - return configuration.getBoolean(ENABLE_BLOOM_FILTER, - ParquetProperties.DEFAULT_BLOOM_FILTER_ENABLED); - } - - public static String getBloomFilterExpectedDistinctNumbers(Configuration configuration) { - return configuration.get(BLOOM_FILTER_EXPECT_DISTINCT_NUMBERS); - } - public static int getMinRowCountForPageSizeCheck(Configuration configuration) { return configuration.getInt(MIN_ROW_COUNT_FOR_PAGE_SIZE_CHECK, ParquetProperties.DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK); @@ -336,6 +313,18 @@ private static int getMaxPaddingSize(Configuration conf) { return conf.getInt(MAX_PADDING_BYTES, ParquetWriter.MAX_PADDING_SIZE_DEFAULT); } + /** Stores {@code length} under {@link #COLUMN_INDEX_TRUNCATE_LENGTH} in the configuration obtained from {@code jobContext}. */ public static void setColumnIndexTruncateLength(JobContext jobContext, int length) { + setColumnIndexTruncateLength(getConfiguration(jobContext), length); + } + + /** Stores {@code length} under {@link #COLUMN_INDEX_TRUNCATE_LENGTH} in {@code conf}. */ public static void setColumnIndexTruncateLength(Configuration conf, int length) { + conf.setInt(COLUMN_INDEX_TRUNCATE_LENGTH, length); + } + + /** Reads {@link #COLUMN_INDEX_TRUNCATE_LENGTH} through {@code conf.getInt}, passing {@code ParquetProperties.DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH} as the default. 
*/ private static int getColumnIndexTruncateLength(Configuration conf) { + return conf.getInt(COLUMN_INDEX_TRUNCATE_LENGTH, ParquetProperties.DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH); + } + private WriteSupport writeSupport; private ParquetOutputCommitter committer; @@ -385,13 +374,12 @@ public RecordWriter getRecordWriter(Configuration conf, Path file, Comp ParquetProperties props = ParquetProperties.builder() .withPageSize(getPageSize(conf)) .withDictionaryPageSize(getDictionaryPageSize(conf)) - .withBloomFilterEnabled(getEnableBloomFilter(conf)) - .withBloomFilterInfo(getBloomFilterColumnNames(conf), getBloomFilterExpectedDistinctNumbers(conf)) .withDictionaryEncoding(getEnableDictionary(conf)) .withWriterVersion(getWriterVersion(conf)) .estimateRowCountForPageSizeCheck(getEstimatePageSizeCheck(conf)) .withMinRowCountForPageSizeCheck(getMinRowCountForPageSizeCheck(conf)) .withMaxRowCountForPageSizeCheck(getMaxRowCountForPageSizeCheck(conf)) + 
.withColumnIndexTruncateLength(getColumnIndexTruncateLength(conf)) .build(); long blockSize = getLongBlockSize(conf); @@ -409,14 +397,12 @@ public RecordWriter getRecordWriter(Configuration conf, Path file, Comp LOG.info("Page size checking is: {}", (props.estimateNextSizeCheck() ? "estimated" : "constant")); LOG.info("Min row count for page size check is: {}", props.getMinRowCountForPageSizeCheck()); LOG.info("Max row count for page size check is: {}", props.getMaxRowCountForPageSizeCheck()); - LOG.info("Parquet Bloom Filter is {}", props.isBloomFilterEnabled()? "on": "off"); - LOG.info("Parquet Bloom filter column names are: {}", props.getBloomFilterExpectValues().keySet()); - LOG.info("Parquet Bloom filter column expect distinct values are: {}", props.getBloomFilterExpectValues().values()); + LOG.info("Truncate length for column indexes is: {}", props.getColumnIndexTruncateLength()); } WriteContext init = writeSupport.init(conf); ParquetFileWriter w = new ParquetFileWriter(HadoopOutputFile.fromPath(file, conf), - init.getSchema(), Mode.CREATE, blockSize, maxPaddingSize); + init.getSchema(), Mode.CREATE, blockSize, maxPaddingSize, props.getColumnIndexTruncateLength()); w.start(); float maxLoad = conf.getFloat(ParquetOutputFormat.MEMORY_POOL_RATIO, diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetReader.java index d9b273bb94..de20808ff8 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetReader.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetReader.java @@ -270,6 +270,16 @@ public Builder useRecordFilter() { return this; } + /** Forwards the flag to the options builder's {@code useColumnIndexFilter(boolean)} and returns this builder for chaining. 
*/ public Builder useColumnIndexFilter(boolean useColumnIndexFilter) { + optionsBuilder.useColumnIndexFilter(useColumnIndexFilter); + return this; + } + + /** Calls the options builder's no-argument {@code useColumnIndexFilter()} and returns this builder for chaining. */ public Builder useColumnIndexFilter() { + optionsBuilder.useColumnIndexFilter(); + return this; + } + public Builder withFileRange(long start, 
long end) { optionsBuilder.withRange(start, end); return this; diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java index a32df39a5d..5b0e4f82d1 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java @@ -278,7 +278,7 @@ public ParquetWriter(Path file, Configuration conf, WriteSupport writeSupport MessageType schema = writeContext.getSchema(); ParquetFileWriter fileWriter = new ParquetFileWriter( - file, schema, mode, rowGroupSize, maxPaddingSize); + file, schema, mode, rowGroupSize, maxPaddingSize, encodingProps.getColumnIndexTruncateLength()); fileWriter.start(); this.codecFactory = new CodecFactory(conf, encodingProps.getPageSizeThreshold()); diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java index 9f476f6d07..c55225c176 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * http://www.apache.org/licenses/LICENSE-2.0 - * + * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -24,9 +24,11 @@ import org.apache.parquet.column.EncodingStats; import org.apache.parquet.column.statistics.BooleanStatistics; import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.internal.hadoop.metadata.IndexReference; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Types; +import org.apache.yetus.audience.InterfaceAudience.Private; /** * Column meta data for a block stored in the file footer and passed in the InputSplit @@ -143,18 +145,18 @@ && positiveLongFitsInAnInt(totalUncompressedSize)) { } public static ColumnChunkMetaData get( - ColumnPath path, - PrimitiveType type, - CompressionCodecName codec, - EncodingStats encodingStats, - Set encodings, - Statistics statistics, - long firstDataPage, - long dictionaryPageOffset, - long bloomFilterDataOffset, - long valueCount, - long totalSize, - long totalUncompressedSize) { + ColumnPath path, + PrimitiveType type, + CompressionCodecName codec, + EncodingStats encodingStats, + Set encodings, + Statistics statistics, + long firstDataPage, + long dictionaryPageOffset, + long bloomFilterDataOffset, + long valueCount, + long totalSize, + long totalUncompressedSize) { // to save space we store those always positive longs in ints when they fit. 
if (positiveLongFitsInAnInt(firstDataPage) && positiveLongFitsInAnInt(dictionaryPageOffset) @@ -162,26 +164,26 @@ && positiveLongFitsInAnInt(valueCount) && positiveLongFitsInAnInt(totalSize) && positiveLongFitsInAnInt(totalUncompressedSize)) { return new IntColumnChunkMetaData( - path, type, codec, - encodingStats, encodings, - statistics, - firstDataPage, - dictionaryPageOffset, - bloomFilterDataOffset, - valueCount, - totalSize, - totalUncompressedSize); + path, type, codec, + encodingStats, encodings, + statistics, + firstDataPage, + dictionaryPageOffset, + bloomFilterDataOffset, + valueCount, + totalSize, + totalUncompressedSize); } else { return new LongColumnChunkMetaData( - path, type, codec, - encodingStats, encodings, - statistics, - firstDataPage, - dictionaryPageOffset, - bloomFilterDataOffset, - valueCount, - totalSize, - totalUncompressedSize); + path, type, codec, + encodingStats, encodings, + statistics, + firstDataPage, + dictionaryPageOffset, + bloomFilterDataOffset, + valueCount, + totalSize, + totalUncompressedSize); } } @@ -213,6 +215,9 @@ protected static boolean positiveLongFitsInAnInt(long value) { // we save 3 references by storing together the column properties that have few distinct values private final ColumnChunkProperties properties; + /* Reference to this chunk's column index; assigned only through setColumnIndexReference. */ private IndexReference columnIndexReference; + /* Reference to this chunk's offset index; assigned only through setOffsetIndexReference. */ private IndexReference offsetIndexReference; + protected ColumnChunkMetaData(ColumnChunkProperties columnChunkProperties) { this(null, columnChunkProperties); } @@ -229,9 +234,7 @@ public CompressionCodecName getCodec() { /** * * @return column identifier - * @deprecated will be removed in 2.0.0. Use {@link #getPrimitiveType()} instead. 
*/ - @Deprecated public ColumnPath getPath() { return properties.getPath(); } @@ -287,6 +290,40 @@ public PrimitiveType getPrimitiveType() { */ abstract public Statistics getStatistics(); + /** + * @return the reference to the column index + */ + @Private + public IndexReference getColumnIndexReference() { + return columnIndexReference; + } + + /** + * @param indexReference + * the reference to the column index + */ + @Private + public void setColumnIndexReference(IndexReference indexReference) { + this.columnIndexReference = indexReference; + } + + /** + * @return the reference to the offset index + */ + @Private + public IndexReference getOffsetIndexReference() { + return offsetIndexReference; + } + + /** + * @param offsetIndexReference + * the reference to the offset index + */ + @Private + public void setOffsetIndexReference(IndexReference offsetIndexReference) { + this.offsetIndexReference = offsetIndexReference; + } + /** * @return all the encodings used in this column */ diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/BlocksCombiner.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/BlocksCombiner.java new file mode 100644 index 0000000000..02dadc7f54 --- /dev/null +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/BlocksCombiner.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.hadoop.util; + +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static java.util.Collections.unmodifiableList; + +/** Static helpers that group the blocks of several Parquet file readers into {@link SmallBlocksUnion}s. A {@link SmallBlock} pairs a reader with one of its block indexes; a {@link SmallBlocksUnion} holds the blocks of one group plus their summed row count. NOTE(review): SmallBlocksUnion keeps the list it is given without copying it and returns that same list from getBlocks(). */ public class BlocksCombiner { + + private static final Logger LOG = LoggerFactory.getLogger(BlocksCombiner.class); + + /** Walks the blocks of each reader in order and packs consecutive blocks into groups. A pending group is emitted before adding a block that would push its summed getTotalByteSize() above {@code maxBlockSize}, so a block that alone exceeds {@code maxBlockSize} ends up in a group of its own. Each group records the sum of getRowCount() over its blocks. Returns the groups as an unmodifiable list, empty when the readers expose no blocks. 
*/ public static List combineLargeBlocks(List readers, long maxBlockSize) { + List blocks = new ArrayList<>(); + long largeBlockSize = 0; + long largeBlockRecords = 0; + List smallBlocks = new ArrayList<>(); + for (ParquetFileReader reader : readers) { + for (int blockIndex = 0; blockIndex < reader.blocksCount(); blockIndex++) { + BlockMetaData block = reader.getBlockMetaData(blockIndex); + /* flush the pending group first when this block would not fit; the isEmpty check keeps an oversized first block from producing an empty group */ if (!smallBlocks.isEmpty() && largeBlockSize + block.getTotalByteSize() > maxBlockSize) { + blocks.add(new SmallBlocksUnion(smallBlocks, largeBlockRecords)); + smallBlocks = new ArrayList<>(); + largeBlockSize = 0; + largeBlockRecords = 0; + } + largeBlockSize += block.getTotalByteSize(); + largeBlockRecords += block.getRowCount(); + smallBlocks.add(new SmallBlock(reader, blockIndex)); + } + } + /* emit whatever is still pending after the last block */ if (!smallBlocks.isEmpty()) { + blocks.add(new SmallBlocksUnion(smallBlocks, largeBlockRecords)); + } + return unmodifiableList(blocks); + } + + /** Closes every reader in the list. An IOException thrown by one close() is logged at error level and not rethrown, so the remaining readers are still closed. 
*/ public static void closeReaders(List readers) { + readers.forEach(r -> { + try { + r.close(); + } catch (IOException e) { + LOG.error("Error closing reader 
{}", r.getFile(), e); + } + }); + } + + public static class SmallBlocksUnion { + private final List blocks; + private final long rowCount; + + public SmallBlocksUnion(List blocks, long rowCount) { + this.blocks = blocks; + this.rowCount = rowCount; + } + + public List getBlocks() { + return blocks; + } + + public long getRowCount() { + return rowCount; + } + } + + public static class SmallBlock { + private final ParquetFileReader reader; + private final int blockIndex; + + public SmallBlock(ParquetFileReader reader, int blockIndex) { + this.reader = reader; + this.blockIndex = blockIndex; + } + + public ParquetFileReader getReader() { + return reader; + } + + public int getBlockIndex() { + return blockIndex; + } + } +} diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/internal/hadoop/metadata/IndexReference.java b/parquet-hadoop/src/main/java/org/apache/parquet/internal/hadoop/metadata/IndexReference.java new file mode 100644 index 0000000000..5e02f1efec --- /dev/null +++ b/parquet-hadoop/src/main/java/org/apache/parquet/internal/hadoop/metadata/IndexReference.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.internal.hadoop.metadata; + +/** + * Reference to an index (OffsetIndex and ColumnIndex) for a row-group containing the offset and length values so the + * reader can read the referenced data. + */ +public class IndexReference { + private final long offset; + private final int length; + + /** Stores the given offset and length; both are immutable afterwards. */ public IndexReference(long offset, int length) { + this.offset = offset; + this.length = length; + } + + /** @return the offset given to the constructor */ public long getOffset() { + return offset; + } + + /** @return the length given to the constructor */ public int getLength() { + return length; + } +} diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/recordlevel/PhoneBookWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/recordlevel/PhoneBookWriter.java index 7acda935c3..18ddca0d96 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/recordlevel/PhoneBookWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/recordlevel/PhoneBookWriter.java @@ -31,6 +31,7 @@ import org.apache.parquet.filter2.compat.FilterCompat.Filter; import org.apache.parquet.hadoop.ParquetReader; import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; import org.apache.parquet.hadoop.example.GroupReadSupport; import org.apache.parquet.hadoop.example.GroupWriteSupport; import org.apache.parquet.schema.MessageType; @@ -91,6 +92,11 @@ public int hashCode() { result = 31 * result + (lat != null ? lat.hashCode() : 0); return result; } + + @Override + public String toString() { + return "Location [lon=" + lon + ", lat=" + lat + "]"; + } } public static class PhoneNumber { @@ -129,6 +135,11 @@ public int hashCode() { result = 31 * result + (kind != null ? kind.hashCode() : 0); return result; } + + @Override + public String toString() { + return "PhoneNumber [number=" + number + ", kind=" + kind + "]"; + } } public static class User { @@ -183,6 +194,11 @@ public int hashCode() { result = 31 * result + (location != null ? 
location.hashCode() : 0); return result; } + + @Override + public String toString() { + return "User [id=" + id + ", name=" + name + ", phoneNumbers=" + phoneNumbers + ", location=" + location + "]"; + } } public static SimpleGroup groupFromUser(User user) { @@ -216,6 +232,56 @@ public static SimpleGroup groupFromUser(User user) { return root; } + /** Rebuilds a User from the id, name, phoneNumbers and location fields of {@code root}; a field that occurs zero times becomes null. */ private static User userFromGroup(Group root) { + return new User(getLong(root, "id"), getString(root, "name"), getPhoneNumbers(getGroup(root, "phoneNumbers")), + getLocation(getGroup(root, "location"))); + } + + /** Converts every phone entry of the group into a PhoneNumber, in order; returns null for a null group. */ private static List getPhoneNumbers(Group phoneNumbers) { + if (phoneNumbers == null) { + return null; + } + List list = new ArrayList<>(); + for (int i = 0, n = phoneNumbers.getFieldRepetitionCount("phone"); i < n; ++i) { + Group phone = phoneNumbers.getGroup("phone", i); + list.add(new PhoneNumber(getLong(phone, "number"), getString(phone, "kind"))); + } + return list; + } + + /** Builds a Location from the lon and lat fields of the group; returns null for a null group. */ private static Location getLocation(Group location) { + if (location == null) { + return null; + } + return new Location(getDouble(location, "lon"), getDouble(location, "lat")); + } + + /** Returns true when {@code field} occurs zero times in {@code group} and false when it occurs once; any other repetition count raises an AssertionError. */ private static boolean isNull(Group group, String field) { + int repetition = group.getFieldRepetitionCount(field); + if (repetition == 0) { + return true; + } else if (repetition == 1) { + return false; + } + throw new AssertionError("Invalid repetitionCount " + repetition + " for field " + field + " in group " + group); + } + + /** Value at index 0 of {@code field}, or null when isNull reports the field absent. */ private static Long getLong(Group group, String field) { + return isNull(group, field) ? null : group.getLong(field, 0); + } + + /** Value at index 0 of {@code field}, or null when isNull reports the field absent. */ private static String getString(Group group, String field) { + return isNull(group, field) ? null : group.getString(field, 0); + } + + /** Value at index 0 of {@code field}, or null when isNull reports the field absent. */ private static Double getDouble(Group group, String field) { + return isNull(group, field) ? null : group.getDouble(field, 0); + } + + /** Sub-group at index 0 of {@code field}, or null when isNull reports the field absent. */ private static Group getGroup(Group group, String field) { + return isNull(group, field) ? 
null : group.getGroup(field, 0); + } + public static File writeToFile(List users) throws IOException { File f = File.createTempFile("phonebook", ".parquet"); f.deleteOnExit(); @@ -229,25 +295,30 @@ public static File writeToFile(List users) throws IOException { } public static void writeToFile(File f, List users) throws IOException { - Configuration conf = new Configuration(); - GroupWriteSupport.setSchema(schema, conf); + write(ExampleParquetWriter.builder(new Path(f.getAbsolutePath())), users); + } - ParquetWriter writer = new ParquetWriter(new Path(f.getAbsolutePath()), conf, new GroupWriteSupport()); - for (User u : users) { - writer.write(groupFromUser(u)); + /** Sets the string form of {@code schema} under GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA on the builder, builds the writer and writes one group per user; try-with-resources closes the writer even when a write fails. */ public static void write(ParquetWriter.Builder builder, List users) throws IOException { + builder.config(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString()); + try (ParquetWriter writer = builder.build()) { + for (User u : users) { + writer.write(groupFromUser(u)); + } } - writer.close(); } - public static List readFile(File f, Filter filter) throws IOException { + /** Builds a reader over {@code file} with GroupReadSupport, a fresh Configuration that has {@code schema} set through GroupWriteSupport.setSchema, and the given filter. The caller owns the returned reader. 
*/ private static ParquetReader createReader(Path file, Filter filter) throws IOException { Configuration conf = new Configuration(); GroupWriteSupport.setSchema(schema, conf); - ParquetReader reader = - ParquetReader.builder(new GroupReadSupport(), new Path(f.getAbsolutePath())) - .withConf(conf) - .withFilter(filter) - .build(); + return ParquetReader.builder(new GroupReadSupport(), file) + .withConf(conf) + .withFilter(filter) + .build(); + } + + public static List readFile(File f, Filter filter) throws IOException { + ParquetReader reader = createReader(new Path(f.getAbsolutePath()), filter); Group current; List users = new ArrayList(); @@ -261,6 +332,16 @@ public static List readFile(File f, Filter filter) throws IOException { return users; } + /** Sets the string form of {@code schema} under GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA on the builder, builds the reader and converts every group it returns into a User until read() yields null. NOTE(review): the reader built here is never closed in this method; consider try-with-resources as in write(). */ public static List readUsers(ParquetReader.Builder builder) throws IOException { + ParquetReader reader = builder.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString()).build(); + + 
List users = new ArrayList<>(); + for (Group group = reader.read(); group != null; group = reader.read()) { + users.add(userFromGroup(group)); + } + return users; + } + public static void main(String[] args) throws IOException { File f = new File(args[0]); writeToFile(f, TestRecordLevelFilters.makeUsers()); diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index d1a3a3c233..358a29a671 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -25,6 +25,7 @@ import static org.apache.parquet.schema.MessageTypeParser.parseMessageType; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; import static org.junit.Assert.assertSame; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -69,6 +70,11 @@ import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.internal.column.columnindex.BoundaryOrder; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder; import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.LogicalTypeAnnotation; @@ -201,6 +207,12 @@ public void testTimeLogicalTypes() { .required(PrimitiveTypeName.INT64) .as(timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS)) 
.named("aTimestampUtcMicros") + .required(PrimitiveTypeName.INT64) + .as(timestampType(false, LogicalTypeAnnotation.TimeUnit.NANOS)) + .named("aTimestampNonUtcNanos") + .required(PrimitiveTypeName.INT64) + .as(timestampType(true, LogicalTypeAnnotation.TimeUnit.NANOS)) + .named("aTimestampUtcNanos") .required(PrimitiveTypeName.INT32) .as(timeType(false, LogicalTypeAnnotation.TimeUnit.MILLIS)) .named("aTimeNonUtcMillis") @@ -213,6 +225,12 @@ public void testTimeLogicalTypes() { .required(PrimitiveTypeName.INT64) .as(timeType(true, LogicalTypeAnnotation.TimeUnit.MICROS)) .named("aTimeUtcMicros") + .required(PrimitiveTypeName.INT64) + .as(timeType(false, LogicalTypeAnnotation.TimeUnit.NANOS)) + .named("aTimeNonUtcNanos") + .required(PrimitiveTypeName.INT64) + .as(timeType(true, LogicalTypeAnnotation.TimeUnit.NANOS)) + .named("aTimeUtcNanos") .named("Message"); List parquetSchema = parquetMetadataConverter.toParquetSchema(expected); MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); @@ -973,4 +991,60 @@ public void testColumnOrders() throws IOException { assertEquals(ColumnOrder.undefined(), columns.get(1).getPrimitiveType().columnOrder()); assertEquals(ColumnOrder.undefined(), columns.get(2).getPrimitiveType().columnOrder()); } + + @Test + /** Round-trips a two-page offset index, built with build(100000), through toParquetOffsetIndex and fromParquetOffsetIndex, then checks the page count, the offsets (101000 and 122000), the compressed page sizes and the first row indexes. 
*/ public void testOffsetIndexConversion() { + OffsetIndexBuilder builder = OffsetIndexBuilder.getBuilder(); + builder.add(1000, 10000, 0); + builder.add(22000, 12000, 100); + OffsetIndex offsetIndex = ParquetMetadataConverter + .fromParquetOffsetIndex(ParquetMetadataConverter.toParquetOffsetIndex(builder.build(100000))); + assertEquals(2, offsetIndex.getPageCount()); + assertEquals(101000, offsetIndex.getOffset(0)); + assertEquals(10000, offsetIndex.getCompressedPageSize(0)); + assertEquals(0, offsetIndex.getFirstRowIndex(0)); + assertEquals(122000, offsetIndex.getOffset(1)); + assertEquals(12000, offsetIndex.getCompressedPageSize(1)); + assertEquals(100, offsetIndex.getFirstRowIndex(1)); + } + + 
@Test + public void testColumnIndexConversion() { + PrimitiveType type = Types.required(PrimitiveTypeName.INT64).named("test_int64"); + ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE); + Statistics stats = Statistics.createStats(type); + stats.incrementNumNulls(16); + stats.updateStats(-100l); + stats.updateStats(100l); + builder.add(stats); + stats = Statistics.createStats(type); + stats.incrementNumNulls(111); + builder.add(stats); + stats = Statistics.createStats(type); + stats.updateStats(200l); + stats.updateStats(500l); + builder.add(stats); + org.apache.parquet.format.ColumnIndex parquetColumnIndex = + ParquetMetadataConverter.toParquetColumnIndex(type, builder.build()); + ColumnIndex columnIndex = ParquetMetadataConverter.fromParquetColumnIndex(type, parquetColumnIndex); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertTrue(Arrays.asList(false, true, false).equals(columnIndex.getNullPages())); + assertTrue(Arrays.asList(16l, 111l, 0l).equals(columnIndex.getNullCounts())); + assertTrue(Arrays.asList( + ByteBuffer.wrap(BytesUtils.longToBytes(-100l)), + ByteBuffer.allocate(0), + ByteBuffer.wrap(BytesUtils.longToBytes(200l))).equals(columnIndex.getMinValues())); + assertTrue(Arrays.asList( + ByteBuffer.wrap(BytesUtils.longToBytes(100l)), + ByteBuffer.allocate(0), + ByteBuffer.wrap(BytesUtils.longToBytes(500l))).equals(columnIndex.getMaxValues())); + + assertNull("Should handle null column index", ParquetMetadataConverter + .toParquetColumnIndex(Types.required(PrimitiveTypeName.INT32).named("test_int32"), null)); + assertNull("Should ignore unsupported types", ParquetMetadataConverter + .toParquetColumnIndex(Types.required(PrimitiveTypeName.INT96).named("test_int96"), columnIndex)); + assertNull("Should ignore unsupported types", + ParquetMetadataConverter.fromParquetColumnIndex(Types.required(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) + .length(12).as(OriginalType.INTERVAL).named("test_interval"), 
parquetColumnIndex)); + } } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java index a5381f073b..9a27defe15 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java @@ -18,8 +18,13 @@ */ package org.apache.parquet.hadoop; +import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.mockito.Matchers.any; import static org.mockito.Matchers.eq; +import static org.mockito.Matchers.isNull; +import static org.mockito.Matchers.same; import static org.mockito.Mockito.inOrder; import static org.apache.parquet.column.Encoding.PLAIN; import static org.apache.parquet.column.Encoding.RLE; @@ -51,13 +56,23 @@ import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.Encoding; import org.apache.parquet.column.page.DataPageV2; +import org.apache.parquet.column.page.DictionaryPage; import org.apache.parquet.column.page.PageReadStore; import org.apache.parquet.column.page.PageReader; import org.apache.parquet.column.page.PageWriter; import org.apache.parquet.column.statistics.BinaryStatistics; import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.hadoop.ParquetFileWriter.Mode; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.hadoop.util.HadoopOutputFile; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import 
org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder; +import org.apache.parquet.io.OutputFile; +import org.apache.parquet.io.PositionOutputStream; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.MessageTypeParser; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; @@ -66,6 +81,40 @@ public class TestColumnChunkPageWriteStore { + // OutputFile implementation to expose the PositionOutputStream internally used by the writer + private static class OutputFileForTesting implements OutputFile { + private PositionOutputStream out; + private final HadoopOutputFile file; + + OutputFileForTesting(Path path, Configuration conf) throws IOException { + file = HadoopOutputFile.fromPath(path, conf); + } + + PositionOutputStream out() { + return out; + } + + @Override + public PositionOutputStream create(long blockSizeHint) throws IOException { + return out = file.create(blockSizeHint); + } + + @Override + public PositionOutputStream createOrOverwrite(long blockSizeHint) throws IOException { + return out = file.createOrOverwrite(blockSizeHint); + } + + @Override + public boolean supportsBlockSize() { + return file.supportsBlockSize(); + } + + @Override + public long defaultBlockSize() { + return file.defaultBlockSize(); + } + } + private int pageSize = 1024; private int initialSize = 1024; private Configuration conf; @@ -98,13 +147,21 @@ public void test() throws Exception { BytesInput data = BytesInput.fromInt(v); int rowCount = 5; int nullCount = 1; + statistics.incrementNumNulls(nullCount); + statistics.setMinMaxFromBytes(new byte[] {0, 1, 2}, new byte[] {0, 1, 2, 3}); + long pageOffset; + long pageSize; { - ParquetFileWriter writer = new ParquetFileWriter(conf, schema, file); + OutputFileForTesting outputFile = new OutputFileForTesting(file, conf); + ParquetFileWriter writer = new ParquetFileWriter(outputFile, schema, Mode.CREATE, + ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.MAX_PADDING_SIZE_DEFAULT); 
writer.start(); writer.startBlock(rowCount); + pageOffset = outputFile.out().getPos(); { - ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema , new HeapByteBufferAllocator()); + ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema, + new HeapByteBufferAllocator(), Integer.MAX_VALUE); PageWriter pageWriter = store.getPageWriter(col); pageWriter.writePageV2( rowCount, nullCount, valueCount, @@ -112,6 +169,7 @@ public void test() throws Exception { dataEncoding, data, statistics); store.flushToFileWriter(writer); + pageSize = outputFile.out().getPos() - pageOffset; } writer.endBlock(); writer.end(new HashMap()); @@ -132,6 +190,20 @@ public void test() throws Exception { assertEquals(dataEncoding, page.getDataEncoding()); assertEquals(v, intValue(page.getData())); assertEquals(statistics.toString(), page.getStatistics().toString()); + + // Checking column/offset indexes for the one page + ColumnChunkMetaData column = footer.getBlocks().get(0).getColumns().get(0); + ColumnIndex columnIndex = reader.readColumnIndex(column); + assertArrayEquals(statistics.getMinBytes(), columnIndex.getMinValues().get(0).array()); + assertArrayEquals(statistics.getMaxBytes(), columnIndex.getMaxValues().get(0).array()); + assertEquals(statistics.getNumNulls(), columnIndex.getNullCounts().get(0).longValue()); + assertFalse(columnIndex.getNullPages().get(0)); + OffsetIndex offsetIndex = reader.readOffsetIndex(column); + assertEquals(1, offsetIndex.getPageCount()); + assertEquals(pageSize, offsetIndex.getCompressedPageSize(0)); + assertEquals(0, offsetIndex.getFirstRowIndex(0)); + assertEquals(pageOffset, offsetIndex.getOffset(0)); + reader.close(); } } @@ -164,7 +236,7 @@ public void testColumnOrderV1() throws IOException { // TODO - look back at this, an allocator was being passed here in the ByteBuffer changes // see comment at this constructor ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore( - 
compressor(UNCOMPRESSED), schema, new HeapByteBufferAllocator()); + compressor(UNCOMPRESSED), schema, new HeapByteBufferAllocator(), Integer.MAX_VALUE); for (ColumnDescriptor col : schema.getColumns()) { PageWriter pageWriter = store.getPageWriter(col); @@ -175,8 +247,20 @@ public void testColumnOrderV1() throws IOException { store.flushToFileWriter(mockFileWriter); for (ColumnDescriptor col : schema.getColumns()) { - inOrder.verify(mockFileWriter).startColumn( - eq(col), eq((long) fakeCount), eq(UNCOMPRESSED)); + inOrder.verify(mockFileWriter).writeColumnChunk( + eq(col), + eq((long) fakeCount), + eq(UNCOMPRESSED), + isNull(DictionaryPage.class), + any(), + eq(fakeData.size()), + eq(fakeData.size()), + eq(fakeStats), + same(ColumnIndexBuilder.getNoOpBuilder()), // Deprecated writePage -> no column index + same(OffsetIndexBuilder.getNoOpBuilder()), // Deprecated writePage -> no offset index + any(), + any(), + any()); } } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnIndexFiltering.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnIndexFiltering.java new file mode 100644 index 0000000000..71155ced7b --- /dev/null +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnIndexFiltering.java @@ -0,0 +1,442 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.hadoop; + +import static java.util.Collections.emptyList; +import static org.apache.parquet.filter2.predicate.FilterApi.and; +import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.doubleColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.eq; +import static org.apache.parquet.filter2.predicate.FilterApi.gtEq; +import static org.apache.parquet.filter2.predicate.FilterApi.longColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.lt; +import static org.apache.parquet.filter2.predicate.FilterApi.ltEq; +import static org.apache.parquet.filter2.predicate.FilterApi.not; +import static org.apache.parquet.filter2.predicate.FilterApi.notEq; +import static org.apache.parquet.filter2.predicate.FilterApi.or; +import static org.apache.parquet.filter2.predicate.FilterApi.userDefined; +import static org.apache.parquet.filter2.predicate.LogicalInverter.invert; +import static org.apache.parquet.hadoop.ParquetFileWriter.Mode.OVERWRITE; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.io.Serializable; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.Random; +import java.util.function.Predicate; +import 
java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.column.ParquetProperties.WriterVersion; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.filter2.predicate.FilterPredicate; +import org.apache.parquet.filter2.predicate.Statistics; +import org.apache.parquet.filter2.predicate.UserDefinedPredicate; +import org.apache.parquet.filter2.recordlevel.PhoneBookWriter; +import org.apache.parquet.filter2.recordlevel.PhoneBookWriter.Location; +import org.apache.parquet.filter2.recordlevel.PhoneBookWriter.PhoneNumber; +import org.apache.parquet.filter2.recordlevel.PhoneBookWriter.User; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; +import org.apache.parquet.hadoop.example.GroupReadSupport; +import org.apache.parquet.io.api.Binary; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Unit tests for high level column index based filtering. 
+ */ +@RunWith(Parameterized.class) +public class TestColumnIndexFiltering { + private static final Logger LOGGER = LoggerFactory.getLogger(TestColumnIndexFiltering.class); + private static final Random RANDOM = new Random(42); + private static final String[] PHONE_KINDS = { null, "mobile", "home", "work" }; + private static final List DATA = Collections.unmodifiableList(generateData(10000)); + private static final Path FILE_V1 = createTempFile(); + private static final Path FILE_V2 = createTempFile(); + + @Parameters + public static Collection params() { + return Arrays.asList(new Object[] { FILE_V1 }, new Object[] { FILE_V2 }); + } + + private final Path file; + + public TestColumnIndexFiltering(Path file) { + this.file = file; + } + + private static List generateData(int rowCount) { + List users = new ArrayList<>(); + List names = generateNames(rowCount); + for (int i = 0; i < rowCount; ++i) { + users.add(new User(i, names.get(i), generatePhoneNumbers(), generateLocation(i, rowCount))); + } + return users; + } + + private static List generateNames(int rowCount) { + List list = new ArrayList<>(); + + // Adding fixed values for filtering + list.add("anderson"); + list.add("anderson"); + list.add("miller"); + list.add("miller"); + list.add("miller"); + list.add("thomas"); + list.add("thomas"); + list.add("williams"); + + int nullCount = rowCount / 100; + + String alphabet = "aabcdeefghiijklmnoopqrstuuvwxyz"; + int maxLength = 8; + for (int i = rowCount - list.size() - nullCount; i >= 0; --i) { + int l = RANDOM.nextInt(maxLength); + StringBuilder builder = new StringBuilder(l); + for (int j = 0; j < l; ++j) { + builder.append(alphabet.charAt(RANDOM.nextInt(alphabet.length()))); + } + list.add(builder.toString()); + } + Collections.sort(list, (str1, str2) -> -str1.compareTo(str2)); + + // Adding nulls to random places + for (int i = 0; i < nullCount; ++i) { + list.add(RANDOM.nextInt(list.size()), null); + } + + return list; + } + + private static List
generatePhoneNumbers() { + int length = RANDOM.nextInt(5) - 1; + if (length < 0) { + return null; + } + List phoneNumbers = new ArrayList<>(length); + for (int i = 0; i < length; ++i) { + // 6 digits numbers + long number = Math.abs(RANDOM.nextLong() % 900000) + 100000; + phoneNumbers.add(new PhoneNumber(number, PHONE_KINDS[RANDOM.nextInt(PHONE_KINDS.length)])); + } + return phoneNumbers; + } + + private static Location generateLocation(int id, int rowCount) { + if (RANDOM.nextDouble() < 0.01) { + return null; + } + + double lat = RANDOM.nextDouble() * 90.0 - (id < rowCount / 2 ? 90.0 : 0.0); + double lon = RANDOM.nextDouble() * 90.0 - (id < rowCount / 4 || id >= 3 * rowCount / 4 ? 90.0 : 0.0); + + return new Location(RANDOM.nextDouble() < 0.01 ? null : lat, RANDOM.nextDouble() < 0.01 ? null : lon); + } + + private static Path createTempFile() { + try { + return new Path(Files.createTempFile("test-ci_", ".parquet").toAbsolutePath().toString()); + } catch (IOException e) { + throw new AssertionError("Unable to create temporary file", e); + } + } + + private List readUsers(FilterPredicate filter, boolean useOtherFiltering) throws IOException { + return readUsers(FilterCompat.get(filter), useOtherFiltering, true); + } + + private List readUsers(FilterPredicate filter, boolean useOtherFiltering, boolean useColumnIndexFilter) + throws IOException { + return readUsers(FilterCompat.get(filter), useOtherFiltering, useColumnIndexFilter); + } + + private List readUsers(Filter filter, boolean useOtherFiltering) throws IOException { + return readUsers(filter, useOtherFiltering, true); + } + + private List readUsers(Filter filter, boolean useOtherFiltering, boolean useColumnIndexFilter) + throws IOException { + return PhoneBookWriter.readUsers(ParquetReader.builder(new GroupReadSupport(), file) + .withFilter(filter) + .useDictionaryFilter(useOtherFiltering) + .useStatsFilter(useOtherFiltering) + .useRecordFilter(useOtherFiltering) + .useColumnIndexFilter(useColumnIndexFilter)); 
+ } + + // Assumes that both lists are in the same order + private static void assertContains(Stream expected, List actual) { + Iterator expIt = expected.iterator(); + if (!expIt.hasNext()) { + return; + } + User exp = expIt.next(); + for (User act : actual) { + if (act.equals(exp)) { + if (!expIt.hasNext()) { + break; + } + exp = expIt.next(); + } + } + assertFalse("Not all expected elements are in the actual list. E.g.: " + exp, expIt.hasNext()); + } + + private void assertCorrectFiltering(Predicate expectedFilter, FilterPredicate actualFilter) + throws IOException { + // Check with only column index based filtering + List result = readUsers(actualFilter, false); + + assertTrue("Column-index filtering should drop some pages", result.size() < DATA.size()); + LOGGER.info("{}/{} records read; filtering ratio: {}%", result.size(), DATA.size(), + 100 * result.size() / DATA.size()); + // Asserts that all the required records are in the result + assertContains(DATA.stream().filter(expectedFilter), result); + // Asserts that all the retrieved records are in the file (validating non-matching records) + assertContains(result.stream(), DATA); + + // Check with all the filtering mechanisms to ensure the result contains exactly the required values + result = readUsers(actualFilter, true); + assertEquals(DATA.stream().filter(expectedFilter).collect(Collectors.toList()), result); + } + + @BeforeClass + public static void createFile() throws IOException { + int pageSize = DATA.size() / 10; // Ensure that several pages will be created + int rowGroupSize = pageSize * 6 * 5; // Ensure that there are more row-groups created + PhoneBookWriter.write(ExampleParquetWriter.builder(FILE_V1) + .withWriteMode(OVERWRITE) + .withRowGroupSize(rowGroupSize) + .withPageSize(pageSize) + .withWriterVersion(WriterVersion.PARQUET_1_0), + DATA); + PhoneBookWriter.write(ExampleParquetWriter.builder(FILE_V2) + .withWriteMode(OVERWRITE) + .withRowGroupSize(rowGroupSize) + .withPageSize(pageSize) +
.withWriterVersion(WriterVersion.PARQUET_2_0), + DATA); + } + + @AfterClass + public static void deleteFile() throws IOException { + FILE_V1.getFileSystem(new Configuration()).delete(FILE_V1, false); + FILE_V2.getFileSystem(new Configuration()).delete(FILE_V2, false); + } + + @Test + public void testSimpleFiltering() throws IOException { + assertCorrectFiltering( + record -> record.getId() == 1234, + eq(longColumn("id"), 1234l)); + assertCorrectFiltering( + record -> "miller".equals(record.getName()), + eq(binaryColumn("name"), Binary.fromString("miller"))); + assertCorrectFiltering( + record -> record.getName() == null, + eq(binaryColumn("name"), null)); + } + + @Test + public void testNoFiltering() throws IOException { + // Column index filtering with no-op filter + assertEquals(DATA, readUsers(FilterCompat.NOOP, false)); + assertEquals(DATA, readUsers(FilterCompat.NOOP, true)); + + // Column index filtering turned off + assertEquals(DATA.stream().filter(user -> user.getId() == 1234).collect(Collectors.toList()), + readUsers(eq(longColumn("id"), 1234l), true, false)); + assertEquals(DATA.stream().filter(user -> "miller".equals(user.getName())).collect(Collectors.toList()), + readUsers(eq(binaryColumn("name"), Binary.fromString("miller")), true, false)); + assertEquals(DATA.stream().filter(user -> user.getName() == null).collect(Collectors.toList()), + readUsers(eq(binaryColumn("name"), null), true, false)); + + // Every filtering mechanism turned off + assertEquals(DATA, readUsers(eq(longColumn("id"), 1234l), false, false)); + assertEquals(DATA, readUsers(eq(binaryColumn("name"), Binary.fromString("miller")), false, false)); + assertEquals(DATA, readUsers(eq(binaryColumn("name"), null), false, false)); + } + + @Test + public void testComplexFiltering() throws IOException { + assertCorrectFiltering( + record -> { + Location loc = record.getLocation(); + Double lat = loc == null ? null : loc.getLat(); + Double lon = loc == null ? 
null : loc.getLon(); + return lat != null && lon != null && 37 <= lat && lat <= 70 && -21 <= lon && lon <= 35; + }, + and(and(gtEq(doubleColumn("location.lat"), 37.0), ltEq(doubleColumn("location.lat"), 70.0)), + and(gtEq(doubleColumn("location.lon"), -21.0), ltEq(doubleColumn("location.lon"), 35.0)))); + assertCorrectFiltering( + record -> { + Location loc = record.getLocation(); + return loc == null || (loc.getLat() == null && loc.getLon() == null); + }, + and(eq(doubleColumn("location.lat"), null), eq(doubleColumn("location.lon"), null))); + assertCorrectFiltering( + record -> { + String name = record.getName(); + return name != null && name.compareTo("thomas") < 0 && record.getId() <= 3 * DATA.size() / 4; + }, + and(lt(binaryColumn("name"), Binary.fromString("thomas")), ltEq(longColumn("id"), 3l * DATA.size() / 4))); + } + + public static class NameStartsWithVowel extends UserDefinedPredicate { + private static final Binary A = Binary.fromString("a"); + private static final Binary B = Binary.fromString("b"); + private static final Binary E = Binary.fromString("e"); + private static final Binary F = Binary.fromString("f"); + private static final Binary I = Binary.fromString("i"); + private static final Binary J = Binary.fromString("j"); + private static final Binary O = Binary.fromString("o"); + private static final Binary P = Binary.fromString("p"); + private static final Binary U = Binary.fromString("u"); + private static final Binary V = Binary.fromString("v"); + + private static boolean isStartingWithVowel(String str) { + if (str == null || str.isEmpty()) { + return false; + } + switch (str.charAt(0)) { + case 'a': + case 'e': + case 'i': + case 'o': + case 'u': + return true; + default: + return false; + } + } + + @Override + public boolean keep(Binary value) { + return value != null && isStartingWithVowel(value.toStringUsingUTF8()); + } + + @Override + public boolean canDrop(Statistics statistics) { + Comparator cmp = statistics.getComparator(); + Binary 
min = statistics.getMin(); + Binary max = statistics.getMax(); + return cmp.compare(max, A) < 0 + || (cmp.compare(min, B) >= 0 && cmp.compare(max, E) < 0) + || (cmp.compare(min, F) >= 0 && cmp.compare(max, I) < 0) + || (cmp.compare(min, J) >= 0 && cmp.compare(max, O) < 0) + || (cmp.compare(min, P) >= 0 && cmp.compare(max, U) < 0) + || cmp.compare(min, V) >= 0; + } + + @Override + public boolean inverseCanDrop(Statistics statistics) { + Comparator cmp = statistics.getComparator(); + Binary min = statistics.getMin(); + Binary max = statistics.getMax(); + return (cmp.compare(min, A) >= 0 && cmp.compare(max, B) < 0) + || (cmp.compare(min, E) >= 0 && cmp.compare(max, F) < 0) + || (cmp.compare(min, I) >= 0 && cmp.compare(max, J) < 0) + || (cmp.compare(min, O) >= 0 && cmp.compare(max, P) < 0) + || (cmp.compare(min, U) >= 0 && cmp.compare(max, V) < 0); + } + } + + public static class IsDivisibleBy extends UserDefinedPredicate implements Serializable { + private long divisor; + + IsDivisibleBy(long divisor) { + this.divisor = divisor; + } + + @Override + public boolean keep(Long value) { + return value != null && value % divisor == 0; + } + + @Override + public boolean canDrop(Statistics statistics) { + long min = statistics.getMin(); + long max = statistics.getMax(); + return min % divisor != 0 && max % divisor != 0 && min / divisor == max / divisor; + } + + @Override + public boolean inverseCanDrop(Statistics statistics) { + long min = statistics.getMin(); + long max = statistics.getMax(); + return min == max && min % divisor == 0; + } + } + + @Test + public void testUDF() throws IOException { + assertCorrectFiltering( + record -> NameStartsWithVowel.isStartingWithVowel(record.getName()) || record.getId() % 234 == 0, + or(userDefined(binaryColumn("name"), NameStartsWithVowel.class), + userDefined(longColumn("id"), new IsDivisibleBy(234)))); + assertCorrectFiltering( + record -> !(NameStartsWithVowel.isStartingWithVowel(record.getName()) || record.getId() % 234 == 0), + 
not(or(userDefined(binaryColumn("name"), NameStartsWithVowel.class), + userDefined(longColumn("id"), new IsDivisibleBy(234))))); + } + + @Test + public void testFilteringWithMissingColumns() throws IOException { + // Missing column filter is always true + assertEquals(DATA, readUsers(notEq(binaryColumn("not-existing-binary"), Binary.EMPTY), true)); + assertCorrectFiltering( + record -> record.getId() == 1234, + and(eq(longColumn("id"), 1234l), + eq(longColumn("not-existing-long"), null))); + assertCorrectFiltering( + record -> "miller".equals(record.getName()), + and(eq(binaryColumn("name"), Binary.fromString("miller")), + invert(userDefined(binaryColumn("not-existing-binary"), NameStartsWithVowel.class)))); + + // Missing column filter is always false + assertEquals(emptyList(), readUsers(lt(longColumn("not-existing-long"), 0l), true)); + assertCorrectFiltering( + record -> "miller".equals(record.getName()), + or(eq(binaryColumn("name"), Binary.fromString("miller")), + gtEq(binaryColumn("not-existing-binary"), Binary.EMPTY))); + assertCorrectFiltering( + record -> record.getId() == 1234, + or(eq(longColumn("id"), 1234l), + userDefined(longColumn("not-existing-long"), new IsDivisibleBy(1)))); + } +} diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java index 535394b370..e4a1d350cc 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java @@ -24,8 +24,12 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.parquet.ParquetReadOptions; import org.apache.parquet.Version; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter; +import 
org.apache.parquet.column.values.bloomfilter.BloomFilter; +import org.apache.parquet.column.values.bloomfilter.BloomFilterReader; import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel; import org.junit.Assume; import org.junit.Rule; @@ -39,10 +43,13 @@ import org.apache.parquet.column.page.PageReader; import org.apache.parquet.column.statistics.BinaryStatistics; import org.apache.parquet.column.statistics.LongStatistics; -import org.apache.parquet.column.values.bloomfilter.*; import org.apache.parquet.format.Statistics; import org.apache.parquet.hadoop.metadata.*; +import org.apache.parquet.hadoop.util.HadoopInputFile; import org.apache.parquet.hadoop.util.HiddenFileFilter; +import org.apache.parquet.internal.column.columnindex.BoundaryOrder; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.MessageTypeParser; @@ -52,6 +59,8 @@ import java.io.File; import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.*; import static org.apache.parquet.CorruptStatistics.shouldIgnoreStatistics; @@ -59,6 +68,7 @@ import static org.junit.Assert.*; import static org.apache.parquet.column.Encoding.BIT_PACKED; import static org.apache.parquet.column.Encoding.PLAIN; +import static org.apache.parquet.format.converter.ParquetMetadataConverter.MAX_STATS_SIZE; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; import static org.apache.parquet.schema.Type.Repetition.*; import static org.apache.parquet.hadoop.TestUtils.enforceEmptyDir; @@ -133,44 +143,6 @@ public void testWriteMode() throws Exception { testFile.delete(); } - @Test - public void testBloomWriteRead() throws Exception { - MessageType schema = MessageTypeParser.parseMessageType("message test { required binary foo; }"); - 
File testFile = temp.newFile(); - testFile.delete(); - - Path path = new Path(testFile.toURI()); - Configuration configuration = new Configuration(); - configuration.set("parquet.bloomFilter.filter.column.names", "foo"); - String colPath[] = {"foo"}; - ColumnDescriptor col = schema.getColumnDescription(colPath); - - BinaryStatistics stats1 = new BinaryStatistics(); - - ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path); - w.start(); - w.startBlock(3); - w.startColumn(col, 5, CODEC); - w.writeDataPage(2, 4, BytesInput.from(BYTES1),stats1, BIT_PACKED, BIT_PACKED, PLAIN); - w.writeDataPage(3, 4, BytesInput.from(BYTES1),stats1, BIT_PACKED, BIT_PACKED, PLAIN); - BloomFilter bloomData = new BlockSplitBloomFilter(0); - bloomData.insert(bloomData.hash(Binary.fromString("hello"))); - bloomData.insert(bloomData.hash(Binary.fromString("world"))); - long blStarts = w.getPos(); - w.writeBloomFilter(bloomData); - w.endColumn(); - w.endBlock(); - w.end(new HashMap()); - ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path); - assertEquals("bloomFilter offset", blStarts, readFooter.getBlocks().get(0).getColumns().get(0).getBloomFilterOffset()); - ParquetFileReader r = new ParquetFileReader(configuration, readFooter.getFileMetaData(), path, - Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(schema.getColumnDescription(colPath))); - BloomFilterReader bloomFilterReader = r.getBloomFilterDataReader(readFooter.getBlocks().get(0)); - BloomFilter bloomDataRead = bloomFilterReader.readBloomFilter(col); - assertTrue(bloomDataRead.find(bloomData.hash(Binary.fromString("hello")))); - assertTrue(bloomDataRead.find(bloomData.hash(Binary.fromString("world")))); - } - @Test public void testWriteRead() throws Exception { File testFile = temp.newFile(); @@ -250,6 +222,42 @@ public void testWriteRead() throws Exception { PrintFooter.main(new String[] {path.toString()}); } + @Test + public void testBloomWriteRead() throws Exception { + 
MessageType schema = MessageTypeParser.parseMessageType("message test { required binary foo; }"); + File testFile = temp.newFile(); + testFile.delete(); + Path path = new Path(testFile.toURI()); + Configuration configuration = new Configuration(); + configuration.set("parquet.bloomFilter.filter.column.names", "foo"); + String colPath[] = {"foo"}; + ColumnDescriptor col = schema.getColumnDescription(colPath); + BinaryStatistics stats1 = new BinaryStatistics(); + ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path); + w.start(); + w.startBlock(3); + w.startColumn(col, 5, CODEC); + w.writeDataPage(2, 4, BytesInput.from(BYTES1),stats1, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(3, 4, BytesInput.from(BYTES1),stats1, BIT_PACKED, BIT_PACKED, PLAIN); + BloomFilter bloomData = new BlockSplitBloomFilter(0); + bloomData.insert(bloomData.hash(Binary.fromString("hello"))); + bloomData.insert(bloomData.hash(Binary.fromString("world"))); + long blStarts = w.getPos(); + w.writeBloomFilter(bloomData); + w.endColumn(); + w.endBlock(); + w.end(new HashMap()); + ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path); + assertEquals("bloomFilter offset", + blStarts, readFooter.getBlocks().get(0).getColumns().get(0).getBloomFilterOffset()); + ParquetFileReader r = new ParquetFileReader(configuration, readFooter.getFileMetaData(), path, + Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(schema.getColumnDescription(colPath))); + BloomFilterReader bloomFilterReader = r.getBloomFilterDataReader(readFooter.getBlocks().get(0)); + BloomFilter bloomDataRead = bloomFilterReader.readBloomFilter(col); + assertTrue(bloomDataRead.find(bloomData.hash(Binary.fromString("hello")))); + assertTrue(bloomDataRead.find(bloomData.hash(Binary.fromString("world")))); + } + @Test public void testAlignmentWithPadding() throws Exception { File testFile = temp.newFile(); @@ -805,4 +813,142 @@ public void testWriteMetadataFileWithRelativeOutputPath() 
throws IOException { ParquetFileWriter.writeMetadataFile(conf, relativeRoot, footers, JobSummaryLevel.ALL); } + @Test + public void testColumnIndexWriteRead() throws Exception { + File testFile = temp.newFile(); + testFile.delete(); + + Path path = new Path(testFile.toURI()); + Configuration configuration = new Configuration(); + + ParquetFileWriter w = new ParquetFileWriter(configuration, SCHEMA, path); + w.start(); + w.startBlock(4); + w.startColumn(C1, 7, CODEC); + w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.endColumn(); + w.startColumn(C2, 8, CODEC); + w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.endColumn(); + w.endBlock(); + w.startBlock(4); + w.startColumn(C1, 5, CODEC); + long c1p1Starts = w.getPos(); + w.writeDataPage(2, 4, BytesInput.from(BYTES1), statsC1(null, Binary.fromString("aaa")), 1, BIT_PACKED, BIT_PACKED, + PLAIN); + long c1p2Starts = w.getPos(); + w.writeDataPage(3, 4, BytesInput.from(BYTES1), statsC1(Binary.fromString("bbb"), Binary.fromString("ccc")), 3, + BIT_PACKED, BIT_PACKED, PLAIN); + w.endColumn(); + long c1Ends = w.getPos(); + w.startColumn(C2, 6, CODEC); + long c2p1Starts = w.getPos(); + w.writeDataPage(2, 4, BytesInput.from(BYTES2), statsC2(117l, 100l), 1, BIT_PACKED, BIT_PACKED, PLAIN); + long c2p2Starts = w.getPos(); + w.writeDataPage(3, 4, BytesInput.from(BYTES2), statsC2(null, null, null), 2, BIT_PACKED, BIT_PACKED, PLAIN); + long c2p3Starts = w.getPos(); + w.writeDataPage(1, 4, BytesInput.from(BYTES2), statsC2(0l), 1, BIT_PACKED, BIT_PACKED, PLAIN); + w.endColumn(); + long c2Ends = w.getPos(); + w.endBlock(); + w.startBlock(4); + w.startColumn(C1, 7, CODEC); + w.writeDataPage(7, 4, BytesInput.from(BYTES3), + // Creating huge stats so the column index will reach the limit and won't be written + statsC1( + Binary.fromConstantByteArray(new byte[(int) MAX_STATS_SIZE]), + Binary.fromConstantByteArray(new byte[1])), + 4, BIT_PACKED, 
BIT_PACKED, PLAIN); + w.endColumn(); + w.startColumn(C2, 8, CODEC); + w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.endColumn(); + w.endBlock(); + w.end(new HashMap()); + + try (ParquetFileReader reader = new ParquetFileReader(HadoopInputFile.fromPath(path, configuration), + ParquetReadOptions.builder().build())) { + ParquetMetadata footer = reader.getFooter(); + assertEquals(3, footer.getBlocks().size()); + BlockMetaData blockMeta = footer.getBlocks().get(1); + assertEquals(2, blockMeta.getColumns().size()); + + ColumnIndex columnIndex = reader.readColumnIndex(blockMeta.getColumns().get(0)); + assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder()); + assertTrue(Arrays.asList(1l, 0l).equals(columnIndex.getNullCounts())); + assertTrue(Arrays.asList(false, false).equals(columnIndex.getNullPages())); + List minValues = columnIndex.getMinValues(); + assertEquals(2, minValues.size()); + List maxValues = columnIndex.getMaxValues(); + assertEquals(2, maxValues.size()); + assertEquals("aaa", new String(minValues.get(0).array(), StandardCharsets.UTF_8)); + assertEquals("aaa", new String(maxValues.get(0).array(), StandardCharsets.UTF_8)); + assertEquals("bbb", new String(minValues.get(1).array(), StandardCharsets.UTF_8)); + assertEquals("ccc", new String(maxValues.get(1).array(), StandardCharsets.UTF_8)); + + columnIndex = reader.readColumnIndex(blockMeta.getColumns().get(1)); + assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder()); + assertTrue(Arrays.asList(0l, 3l, 0l).equals(columnIndex.getNullCounts())); + assertTrue(Arrays.asList(false, true, false).equals(columnIndex.getNullPages())); + minValues = columnIndex.getMinValues(); + assertEquals(3, minValues.size()); + maxValues = columnIndex.getMaxValues(); + assertEquals(3, maxValues.size()); + assertEquals(100, BytesUtils.bytesToLong(minValues.get(0).array())); + assertEquals(117, BytesUtils.bytesToLong(maxValues.get(0).array())); + 
assertEquals(0, minValues.get(1).array().length); + assertEquals(0, maxValues.get(1).array().length); + assertEquals(0, BytesUtils.bytesToLong(minValues.get(2).array())); + assertEquals(0, BytesUtils.bytesToLong(maxValues.get(2).array())); + + OffsetIndex offsetIndex = reader.readOffsetIndex(blockMeta.getColumns().get(0)); + assertEquals(2, offsetIndex.getPageCount()); + assertEquals(c1p1Starts, offsetIndex.getOffset(0)); + assertEquals(c1p2Starts, offsetIndex.getOffset(1)); + assertEquals(c1p2Starts - c1p1Starts, offsetIndex.getCompressedPageSize(0)); + assertEquals(c1Ends - c1p2Starts, offsetIndex.getCompressedPageSize(1)); + assertEquals(0, offsetIndex.getFirstRowIndex(0)); + assertEquals(1, offsetIndex.getFirstRowIndex(1)); + + offsetIndex = reader.readOffsetIndex(blockMeta.getColumns().get(1)); + assertEquals(3, offsetIndex.getPageCount()); + assertEquals(c2p1Starts, offsetIndex.getOffset(0)); + assertEquals(c2p2Starts, offsetIndex.getOffset(1)); + assertEquals(c2p3Starts, offsetIndex.getOffset(2)); + assertEquals(c2p2Starts - c2p1Starts, offsetIndex.getCompressedPageSize(0)); + assertEquals(c2p3Starts - c2p2Starts, offsetIndex.getCompressedPageSize(1)); + assertEquals(c2Ends - c2p3Starts, offsetIndex.getCompressedPageSize(2)); + assertEquals(0, offsetIndex.getFirstRowIndex(0)); + assertEquals(1, offsetIndex.getFirstRowIndex(1)); + assertEquals(3, offsetIndex.getFirstRowIndex(2)); + + assertNull(reader.readColumnIndex(footer.getBlocks().get(2).getColumns().get(0))); + } + } + + private org.apache.parquet.column.statistics.Statistics statsC1(Binary... values) { + org.apache.parquet.column.statistics.Statistics stats = org.apache.parquet.column.statistics.Statistics + .createStats(C1.getPrimitiveType()); + for (Binary value : values) { + if (value == null) { + stats.incrementNumNulls(); + } else { + stats.updateStats(value); + } + } + return stats; + } + + private org.apache.parquet.column.statistics.Statistics statsC2(Long... 
values) { + org.apache.parquet.column.statistics.Statistics stats = org.apache.parquet.column.statistics.Statistics + .createStats(C2.getPrimitiveType()); + for (Long value : values) { + if (value == null) { + stats.incrementNumNulls(); + } else { + stats.updateStats(value); + } + } + return stats; + } } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriterMergeBlocks.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriterMergeBlocks.java new file mode 100644 index 0000000000..a972238cbe --- /dev/null +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriterMergeBlocks.java @@ -0,0 +1,280 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.hadoop; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.Preconditions; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.example.data.simple.SimpleGroupFactory; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; +import org.apache.parquet.hadoop.example.GroupReadSupport; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Types; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.stream.Collectors; + +import static java.util.Arrays.asList; +import static org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER; +import static org.apache.parquet.hadoop.ParquetWriter.DEFAULT_PAGE_SIZE; +import static org.apache.parquet.schema.OriginalType.UTF8; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; + +public class TestParquetWriterMergeBlocks { + + @Rule + public TemporaryFolder temp = new TemporaryFolder(); + + public static final int FILE_SIZE = 10000; + public static final Configuration CONF = new Configuration(); + public static 
final Map EMPTY_METADATA = + new HashMap(); + public static final MessageType FILE_SCHEMA = Types.buildMessage() + .required(INT32).named("id") + .required(BINARY).as(UTF8).named("string") + .named("AppendTest"); + public static final SimpleGroupFactory GROUP_FACTORY = + new SimpleGroupFactory(FILE_SCHEMA); + + public Path file1; + public List file1content = new ArrayList(); + public Path file2; + public List file2content = new ArrayList(); + + @Before + public void createSourceData() throws IOException { + this.file1 = newTemp(); + this.file2 = newTemp(); + + ParquetWriter writer1 = ExampleParquetWriter.builder(file1) + .withType(FILE_SCHEMA) + .build(); + ParquetWriter writer2 = ExampleParquetWriter.builder(file2) + .withType(FILE_SCHEMA) + .build(); + + for (int i = 0; i < FILE_SIZE; i += 1) { + Group group1 = GROUP_FACTORY.newGroup(); + group1.add("id", i); + group1.add("string", UUID.randomUUID().toString()); + writer1.write(group1); + file1content.add(group1); + + Group group2 = GROUP_FACTORY.newGroup(); + group2.add("id", FILE_SIZE+i); + group2.add("string", UUID.randomUUID().toString()); + writer2.write(group2); + file2content.add(group2); + } + + writer1.close(); + writer2.close(); + } + + @Test + public void testBasicBehavior() throws IOException { + Path combinedFile = newTemp(); + ParquetFileWriter writer = new ParquetFileWriter( + CONF, FILE_SCHEMA, combinedFile); + + // Merge schema and extraMeta + List inputFiles = asList(file1, file2); + FileMetaData mergedMeta = ParquetFileWriter.mergeMetadataFiles(inputFiles, CONF).getFileMetaData(); + List inputFileList = toInputFiles(inputFiles); + CodecFactory.BytesCompressor compressor = new CodecFactory(CONF, DEFAULT_PAGE_SIZE).getCompressor(CompressionCodecName.SNAPPY); + + writer.merge(inputFileList, compressor, mergedMeta.getCreatedBy(), 128 * 1024 * 1024); + + LinkedList expected = new LinkedList<>(); + expected.addAll(file1content); + expected.addAll(file2content); + + ParquetReader reader = 
ParquetReader + .builder(new GroupReadSupport(), combinedFile) + .build(); + + Group next; + while ((next = reader.read()) != null) { + Group expectedNext = expected.removeFirst(); + // check each value; equals is not supported for simple records + Assert.assertEquals("Each id should match", + expectedNext.getInteger("id", 0), next.getInteger("id", 0)); + Assert.assertEquals("Each string should match", + expectedNext.getString("string", 0), next.getString("string", 0)); + } + + Assert.assertEquals("All records should be present", 0, expected.size()); + } + + private List toInputFiles(List inputFiles) { + return inputFiles.stream() + .map(input -> { + try { + return HadoopInputFile.fromPath(input, CONF); + } catch (Exception e) { + throw new RuntimeException(e); + } + }).collect(Collectors.toList()); + } + + @Test + public void testMergedMetadata() throws IOException { + Path combinedFile = newTemp(); + ParquetFileWriter writer = new ParquetFileWriter( + CONF, FILE_SCHEMA, combinedFile); + + // Merge schema and extraMeta + List inputFiles = asList(file1, file2); + FileMetaData mergedMeta = ParquetFileWriter.mergeMetadataFiles(inputFiles, CONF).getFileMetaData(); + List inputFileList = toInputFiles(inputFiles); + CompressionCodecName codecName = CompressionCodecName.GZIP; + CodecFactory.BytesCompressor compressor = new CodecFactory(CONF, DEFAULT_PAGE_SIZE).getCompressor(codecName); + writer.merge(inputFileList, compressor, mergedMeta.getCreatedBy(), 128 * 1024 * 1024); + + ParquetMetadata combinedFooter = ParquetFileReader.readFooter( + CONF, combinedFile, NO_FILTER); + ParquetMetadata f1Footer = ParquetFileReader.readFooter( + CONF, file1, NO_FILTER); + ParquetMetadata f2Footer = ParquetFileReader.readFooter( + CONF, file2, NO_FILTER); + + LinkedList expectedRowGroups = new LinkedList<>(); + expectedRowGroups.addAll(f1Footer.getBlocks()); + expectedRowGroups.addAll(f2Footer.getBlocks()); + long totalRowCount = 
expectedRowGroups.stream().mapToLong(BlockMetaData::getRowCount).sum(); + Assert.assertEquals("Combined should have a single row group", + 1, + combinedFooter.getBlocks().size()); + + BlockMetaData rowGroup = combinedFooter.getBlocks().get(0); + Assert.assertEquals("Row count should match", + totalRowCount, rowGroup.getRowCount()); + assertColumnsEquivalent(f1Footer.getBlocks().get(0).getColumns(), rowGroup.getColumns(), codecName); + } + + public void assertColumnsEquivalent(List expected, + List actual, + CompressionCodecName codecName) { + Assert.assertEquals("Should have the expected columns", + expected.size(), actual.size()); + for (int i = 0; i < actual.size(); i += 1) { + long numNulls = 0; + long valueCount = 0; + ColumnChunkMetaData current = actual.get(i); + Statistics statistics = current.getStatistics(); + numNulls += statistics.getNumNulls(); + valueCount += current.getValueCount(); + if (i != 0) { + ColumnChunkMetaData previous = actual.get(i - 1); + long expectedStart = previous.getStartingPos() + previous.getTotalSize(); + Assert.assertEquals("Should start after the previous column", + expectedStart, current.getStartingPos()); + } + + assertColumnMetadataEquivalent(expected.get(i), current, codecName, numNulls, valueCount); + } + } + + public void assertColumnMetadataEquivalent(ColumnChunkMetaData expected, + ColumnChunkMetaData actual, + CompressionCodecName codecName, + long numNulls, + long valueCount) { + Assert.assertEquals("Should be the expected column", + expected.getPath(), actual.getPath()); + Assert.assertEquals("Primitive type should not change", + expected.getType(), actual.getType()); + Assert.assertEquals("Compression codec should not change", + codecName, actual.getCodec()); + Assert.assertEquals("Data encodings should not change", + expected.getEncodings(), actual.getEncodings()); + Assert.assertEquals("Statistics should not change", + numNulls, actual.getStatistics().getNumNulls()); + Assert.assertEquals("Number of values should
not change", + valueCount, actual.getValueCount()); + + } + + @Test + public void testAllowDroppingColumns() throws IOException { + MessageType droppedColumnSchema = Types.buildMessage() + .required(BINARY).as(UTF8).named("string") + .named("AppendTest"); + + Path droppedColumnFile = newTemp(); + List inputFiles = asList(file1, file2); + ParquetFileWriter writer = new ParquetFileWriter( + CONF, droppedColumnSchema, droppedColumnFile); + List inputFileList = toInputFiles(inputFiles); + CompressionCodecName codecName = CompressionCodecName.GZIP; + CodecFactory.BytesCompressor compressor = new CodecFactory(CONF, DEFAULT_PAGE_SIZE).getCompressor(codecName); + writer.merge(inputFileList, compressor, "", 128*1024*1024); + + LinkedList expected = new LinkedList(); + expected.addAll(file1content); + expected.addAll(file2content); + + ParquetMetadata footer = ParquetFileReader.readFooter( + CONF, droppedColumnFile, NO_FILTER); + for (BlockMetaData rowGroup : footer.getBlocks()) { + Assert.assertEquals("Should have only the string column", + 1, rowGroup.getColumns().size()); + } + + ParquetReader reader = ParquetReader + .builder(new GroupReadSupport(), droppedColumnFile) + .build(); + + Group next; + while ((next = reader.read()) != null) { + Group expectedNext = expected.removeFirst(); + Assert.assertEquals("Each string should match", + expectedNext.getString("string", 0), next.getString("string", 0)); + } + + Assert.assertEquals("All records should be present", 0, expected.size()); + } + + private Path newTemp() throws IOException { + File file = temp.newFile(); + Preconditions.checkArgument(file.delete(), "Could not remove temp file"); + return new Path(file.toString()); + } +} diff --git a/parquet-hive/parquet-hive-storage-handler/src/main/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java b/parquet-hive/parquet-hive-storage-handler/src/main/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java index 5d3ab488b2..6d229a696f 
100644 --- a/parquet-hive/parquet-hive-storage-handler/src/main/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java +++ b/parquet-hive/parquet-hive-storage-handler/src/main/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -30,12 +30,15 @@ import org.apache.parquet.schema.ConversionPatterns; import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type; import org.apache.parquet.schema.Type.Repetition; +import org.apache.parquet.schema.Types; + +import static org.apache.parquet.schema.LogicalTypeAnnotation.listType; public class HiveSchemaConverter { @@ -105,7 +108,7 @@ private static Type convertType(final String name, final TypeInfo typeInfo, fina // 1 anonymous element "array_element" private static GroupType convertArrayType(final String name, final ListTypeInfo typeInfo) { final TypeInfo subType = typeInfo.getListElementTypeInfo(); - return listWrapper(name, OriginalType.LIST, new GroupType(Repetition.REPEATED, + return listWrapper(name, listType(), new GroupType(Repetition.REPEATED, 
ParquetHiveSerDe.ARRAY.toString(), convertType("array_element", subType))); } @@ -127,8 +130,8 @@ private static GroupType convertMapType(final String name, final MapTypeInfo typ return ConversionPatterns.mapType(Repetition.OPTIONAL, name, keyType, valueType); } - private static GroupType listWrapper(final String name, final OriginalType originalType, + private static GroupType listWrapper(final String name, final LogicalTypeAnnotation logicalTypeAnnotation, final GroupType groupType) { - return new GroupType(Repetition.OPTIONAL, name, originalType, groupType); + return Types.optionalGroup().addField(groupType).as(logicalTypeAnnotation).named(name); } } diff --git a/parquet-pig/pom.xml b/parquet-pig/pom.xml index 3b7e5703fe..0d3f202c27 100644 --- a/parquet-pig/pom.xml +++ b/parquet-pig/pom.xml @@ -48,8 +48,8 @@ org.apache.parquet - parquet-format - ${parquet.format.version} + parquet-format-structures + ${project.version} org.apache.pig diff --git a/parquet-pig/src/main/java/org/apache/parquet/pig/PigSchemaConverter.java b/parquet-pig/src/main/java/org/apache/parquet/pig/PigSchemaConverter.java index 24f7ee8c9a..19356616ae 100644 --- a/parquet-pig/src/main/java/org/apache/parquet/pig/PigSchemaConverter.java +++ b/parquet-pig/src/main/java/org/apache/parquet/pig/PigSchemaConverter.java @@ -23,7 +23,10 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Optional; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.Types; import org.apache.pig.LoadPushDown.RequiredField; import org.apache.pig.LoadPushDown.RequiredFieldList; import org.apache.pig.data.DataType; @@ -38,7 +41,6 @@ import org.apache.parquet.schema.ConversionPatterns; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import 
org.apache.parquet.schema.PrimitiveType.PrimitiveTypeNameConverter; @@ -47,6 +49,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static java.util.Optional.of; +import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; + /** * Converts a Pig Schema into a Parquet schema @@ -205,7 +210,7 @@ private FieldSchema getSimpleFieldSchema(final String fieldName, Type parquetTyp throws FrontendException { final PrimitiveTypeName parquetPrimitiveTypeName = parquetType.asPrimitiveType().getPrimitiveTypeName(); - final OriginalType originalType = parquetType.getOriginalType(); + final LogicalTypeAnnotation logicalTypeAnnotation = parquetType.getLogicalTypeAnnotation(); return parquetPrimitiveTypeName.convert( new PrimitiveTypeNameConverter() { @Override @@ -242,7 +247,7 @@ public FieldSchema convertINT96(PrimitiveTypeName primitiveTypeName) @Override public FieldSchema convertFIXED_LEN_BYTE_ARRAY( PrimitiveTypeName primitiveTypeName) throws FrontendException { - if (originalType == OriginalType.DECIMAL) { + if (logicalTypeAnnotation instanceof LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) { return new FieldSchema(fieldName, null, DataType.BIGDECIMAL); } else { return new FieldSchema(fieldName, null, DataType.BYTEARRAY); @@ -258,7 +263,7 @@ public FieldSchema convertBOOLEAN(PrimitiveTypeName primitiveTypeName) @Override public FieldSchema convertBINARY(PrimitiveTypeName primitiveTypeName) throws FrontendException { - if (originalType != null && originalType == OriginalType.UTF8) { + if (logicalTypeAnnotation instanceof LogicalTypeAnnotation.StringLogicalTypeAnnotation) { return new FieldSchema(fieldName, null, DataType.CHARARRAY); } else { return new FieldSchema(fieldName, null, DataType.BYTEARRAY); @@ -267,47 +272,71 @@ public FieldSchema convertBINARY(PrimitiveTypeName primitiveTypeName) }); } + /* + * RuntimeException class to workaround throwing checked FrontendException in logical type visitors. 
+ * Wrap the FrontendException inside the visitor in an inner catch block, and rethrow it outside of the visitor + */ + private static final class FrontendExceptionWrapper extends RuntimeException { + final FrontendException frontendException; + + FrontendExceptionWrapper(FrontendException frontendException) { + this.frontendException = frontendException; + } + } + private FieldSchema getComplexFieldSchema(String fieldName, Type parquetType) throws FrontendException { GroupType parquetGroupType = parquetType.asGroupType(); - OriginalType originalType = parquetGroupType.getOriginalType(); - if (originalType != null) { - switch(originalType) { - case MAP: - // verify that its a map - if (parquetGroupType.getFieldCount() != 1 || parquetGroupType.getType(0).isPrimitive()) { - throw new SchemaConversionException("Invalid map type " + parquetGroupType); - } - GroupType mapKeyValType = parquetGroupType.getType(0).asGroupType(); - if (!mapKeyValType.isRepetition(Repetition.REPEATED) || - (mapKeyValType.getOriginalType() != null && !mapKeyValType.getOriginalType().equals(OriginalType.MAP_KEY_VALUE)) || - mapKeyValType.getFieldCount()!=2) { - throw new SchemaConversionException("Invalid map type " + parquetGroupType); - } - // if value is not primitive wrap it in a tuple - Type valueType = mapKeyValType.getType(1); - Schema s = convertField(valueType); - s.getField(0).alias = null; - return new FieldSchema(fieldName, s, DataType.MAP); - case LIST: - Type type = parquetGroupType.getType(0); - if (parquetGroupType.getFieldCount()!= 1 || type.isPrimitive()) { - // an array is effectively a bag - Schema primitiveSchema = new Schema(getSimpleFieldSchema(parquetGroupType.getFieldName(0), type)); - Schema tupleSchema = new Schema(new FieldSchema(ARRAY_VALUE_NAME, primitiveSchema, DataType.TUPLE)); - return new FieldSchema(fieldName, tupleSchema, DataType.BAG); - } - GroupType tupleType = parquetGroupType.getType(0).asGroupType(); - if (!tupleType.isRepetition(Repetition.REPEATED)) 
{ - throw new SchemaConversionException("Invalid list type " + parquetGroupType); - } - Schema tupleSchema = new Schema(new FieldSchema(tupleType.getName(), convertFields(tupleType.getFields()), DataType.TUPLE)); - return new FieldSchema(fieldName, tupleSchema, DataType.BAG); - case MAP_KEY_VALUE: - case ENUM: - case UTF8: - default: - throw new SchemaConversionException("Unexpected original type for " + parquetType + ": " + originalType); + LogicalTypeAnnotation logicalTypeAnnotation = parquetGroupType.getLogicalTypeAnnotation(); + if (logicalTypeAnnotation != null) { + try { + return logicalTypeAnnotation.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public Optional visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) { + try { + // verify that its a map + if (parquetGroupType.getFieldCount() != 1 || parquetGroupType.getType(0).isPrimitive()) { + throw new SchemaConversionException("Invalid map type " + parquetGroupType); + } + GroupType mapKeyValType = parquetGroupType.getType(0).asGroupType(); + if (!mapKeyValType.isRepetition(Repetition.REPEATED) || + (mapKeyValType.getLogicalTypeAnnotation() != null && !mapKeyValType.getLogicalTypeAnnotation().equals(LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance())) || + mapKeyValType.getFieldCount() != 2) { + throw new SchemaConversionException("Invalid map type " + parquetGroupType); + } + // if value is not primitive wrap it in a tuple + Type valueType = mapKeyValType.getType(1); + Schema s = convertField(valueType); + s.getField(0).alias = null; + return of(new FieldSchema(fieldName, s, DataType.MAP)); + } catch (FrontendException e) { + throw new FrontendExceptionWrapper(e); + } + } + + @Override + public Optional visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) { + try { + Type type = parquetGroupType.getType(0); + if (parquetGroupType.getFieldCount() != 1 || type.isPrimitive()) { + // an array is effectively a bag + Schema 
primitiveSchema = new Schema(getSimpleFieldSchema(parquetGroupType.getFieldName(0), type)); + Schema tupleSchema = new Schema(new FieldSchema(ARRAY_VALUE_NAME, primitiveSchema, DataType.TUPLE)); + return of(new FieldSchema(fieldName, tupleSchema, DataType.BAG)); + } + GroupType tupleType = parquetGroupType.getType(0).asGroupType(); + if (!tupleType.isRepetition(Repetition.REPEATED)) { + throw new SchemaConversionException("Invalid list type " + parquetGroupType); + } + Schema tupleSchema = new Schema(new FieldSchema(tupleType.getName(), convertFields(tupleType.getFields()), DataType.TUPLE)); + return of(new FieldSchema(fieldName, tupleSchema, DataType.BAG)); + } catch (FrontendException e) { + throw new FrontendExceptionWrapper(e); + } + } + }).orElseThrow(() -> new SchemaConversionException("Unexpected original type for " + parquetType + ": " + logicalTypeAnnotation)); + } catch (FrontendExceptionWrapper e) { + throw e.frontendException; } } else { // if original type is not set, we assume it to be tuple @@ -359,7 +388,7 @@ private Type convertWithName(FieldSchema fieldSchema, String name) { case DataType.BOOLEAN: return primitive(name, PrimitiveTypeName.BOOLEAN); case DataType.CHARARRAY: - return primitive(name, PrimitiveTypeName.BINARY, OriginalType.UTF8); + return primitive(name, PrimitiveTypeName.BINARY, stringType()); case DataType.INTEGER: return primitive(name, PrimitiveTypeName.INT32); case DataType.LONG: @@ -403,12 +432,12 @@ private String name(String fieldAlias, String defaultName) { return fieldAlias == null ? 
defaultName : fieldAlias; } - private Type primitive(String name, PrimitiveTypeName primitive, OriginalType originalType) { - return new PrimitiveType(Repetition.OPTIONAL, primitive, name, originalType); + private Type primitive(String name, PrimitiveTypeName primitive, LogicalTypeAnnotation logicalTypeAnnotation) { + return Types.primitive(primitive, Repetition.OPTIONAL).as(logicalTypeAnnotation).named(name); } private PrimitiveType primitive(String name, PrimitiveTypeName primitive) { - return new PrimitiveType(Repetition.OPTIONAL, primitive, name, null); + return Types.primitive(primitive, Repetition.OPTIONAL).named(name); } /** @@ -511,7 +540,8 @@ private Type filterBag(GroupType bagType, FieldSchema bagFieldSchema) throws Fro } Type nested = bagType.getType(0); FieldSchema innerField = bagFieldSchema.schema.getField(0); - if (nested.isPrimitive() || nested.getOriginalType() == OriginalType.MAP || nested.getOriginalType() == OriginalType.LIST) { + if (nested.isPrimitive() || nested.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.MapLogicalTypeAnnotation + || nested.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.ListLogicalTypeAnnotation) { // Bags always contain tuples => we skip the extra tuple that was inserted in that case. innerField = innerField.schema.getField(0); } diff --git a/parquet-pig/src/main/java/org/apache/parquet/pig/convert/TupleConverter.java b/parquet-pig/src/main/java/org/apache/parquet/pig/convert/TupleConverter.java index 18ea9e451e..48bb7539aa 100644 --- a/parquet-pig/src/main/java/org/apache/parquet/pig/convert/TupleConverter.java +++ b/parquet-pig/src/main/java/org/apache/parquet/pig/convert/TupleConverter.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -23,6 +23,7 @@ import java.util.List; import java.math.BigDecimal; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.data.DataByteArray; import org.apache.pig.data.DataType; @@ -40,11 +41,8 @@ import org.apache.parquet.io.api.GroupConverter; import org.apache.parquet.io.api.PrimitiveConverter; import org.apache.parquet.pig.TupleConversionException; -import org.apache.parquet.pig.convert.DecimalUtils; import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.DecimalMetadata; import org.apache.parquet.schema.Type; import org.apache.parquet.schema.Type.Repetition; @@ -74,7 +72,7 @@ public TupleConverter(GroupType parquetSchema, Schema pigSchema, boolean elephan FieldSchema field = pigSchema.getField(i); if(parquetSchema.containsField(field.alias) || columnIndexAccess) { Type type = getType(columnIndexAccess, field.alias, i); - + if(type != null) { final int index = i; converters[c++] = newConverter(field, type, new ParentValueContainer() { @@ -85,7 +83,7 @@ void add(Object value) { }, elephantBirdCompatible, columnIndexAccess); } } - + } } catch (FrontendException e) { throw new ParquetDecodingException("can not initialize pig converter from:\n" + parquetSchema + "\n" + pigSchema, e); @@ -100,10 +98,10 @@ private Type getType(boolean columnIndexAccess, String alias, 
int index) { } else { return parquetSchema.getType(parquetSchema.getFieldIndex(alias)); } - + return null; } - + static Converter newConverter(FieldSchema pigField, Type type, final ParentValueContainer parent, boolean elephantBirdCompatible, boolean columnIndexAccess) { try { switch (pigField.type) { @@ -122,7 +120,7 @@ public void end() { case DataType.CHARARRAY: //If the orignal type isn't a string, we don't want to use the dictionary because //a custom implementation will be needed for each type. Just default to no dictionary. - return new FieldStringConverter(parent, type.getOriginalType() == OriginalType.UTF8); + return new FieldStringConverter(parent, type.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.StringLogicalTypeAnnotation); case DataType.BYTEARRAY: return new FieldByteArrayConverter(parent); case DataType.INTEGER: @@ -277,8 +275,6 @@ public void addDouble(double value) { public void addBoolean(boolean value) { parent.add(Boolean.toString(value)); } - - } /** @@ -403,7 +399,7 @@ final public void addLong(long value) { @Override public void addInt(int value) { - parent.add((long)value); + parent.add((long)value); } @Override @@ -425,7 +421,7 @@ public void addBoolean(boolean value) { public void addBinary(Binary value) { parent.add(Long.parseLong(value.toStringUsingUTF8())); } - + } /** @@ -511,8 +507,6 @@ public void addDouble(double value) { public void addBinary(Binary value) { parent.add(Boolean.parseBoolean(value.toStringUsingUTF8())); } - - } /** @@ -554,7 +548,8 @@ static class BagConverter extends GroupConverter { ParentValueContainer childsParent; FieldSchema pigField; - if (nestedType.isPrimitive() || nestedType.getOriginalType() == OriginalType.MAP || nestedType.getOriginalType() == OriginalType.LIST) { + if (nestedType.isPrimitive() || nestedType.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.MapLogicalTypeAnnotation + || nestedType.getLogicalTypeAnnotation() instanceof 
LogicalTypeAnnotation.ListLogicalTypeAnnotation) { // Pig bags always contain tuples // In that case we need to wrap the value in an extra tuple childsParent = new ParentValueContainer() { diff --git a/parquet-protobuf/pom.xml b/parquet-protobuf/pom.xml index b6f4627b16..329046db78 100644 --- a/parquet-protobuf/pom.xml +++ b/parquet-protobuf/pom.xml @@ -86,6 +86,17 @@
+ + + + + org.apache.thrift + libthrift + ${format.thrift.version} + + + + lukasnalezenec diff --git a/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java b/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java index 979d78ea71..92d8b624d9 100644 --- a/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java +++ b/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -31,15 +31,17 @@ import org.apache.parquet.io.api.PrimitiveConverter; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.IncompatibleSchemaModificationException; -import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.Type; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import static com.google.protobuf.Descriptors.FieldDescriptor.JavaType; +import static java.util.Optional.of; /** * Converts Protocol Buffer message (both top level and inner) to parquet. 
@@ -128,13 +130,22 @@ public void add(Object value) { }; } - if (OriginalType.LIST == parquetType.getOriginalType()) { - return new ListConverter(parentBuilder, fieldDescriptor, parquetType); + LogicalTypeAnnotation logicalTypeAnnotation = parquetType.getLogicalTypeAnnotation(); + if (logicalTypeAnnotation == null) { + return newScalarConverter(parent, parentBuilder, fieldDescriptor, parquetType); } - if (OriginalType.MAP == parquetType.getOriginalType()) { - return new MapConverter(parentBuilder, fieldDescriptor, parquetType); - } - return newScalarConverter(parent, parentBuilder, fieldDescriptor, parquetType); + + return logicalTypeAnnotation.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public Optional visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) { + return of(new ListConverter(parentBuilder, fieldDescriptor, parquetType)); + } + + @Override + public Optional visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) { + return of(new MapConverter(parentBuilder, fieldDescriptor, parquetType)); + } + }).orElse(newScalarConverter(parent, parentBuilder, fieldDescriptor, parquetType)); } private Converter newScalarConverter(ParentValueContainer pvc, Message.Builder parentBuilder, Descriptors.FieldDescriptor fieldDescriptor, Type parquetType) { @@ -376,9 +387,9 @@ final class ListConverter extends GroupConverter { private final Converter converter; public ListConverter(Message.Builder parentBuilder, Descriptors.FieldDescriptor fieldDescriptor, Type parquetType) { - OriginalType originalType = parquetType.getOriginalType(); - if (originalType != OriginalType.LIST || parquetType.isPrimitive()) { - throw new ParquetDecodingException("Expected LIST wrapper. 
Found: " + originalType + " instead."); + LogicalTypeAnnotation logicalTypeAnnotation = parquetType.getLogicalTypeAnnotation(); + if (!(logicalTypeAnnotation instanceof LogicalTypeAnnotation.ListLogicalTypeAnnotation) || parquetType.isPrimitive()) { + throw new ParquetDecodingException("Expected LIST wrapper. Found: " + logicalTypeAnnotation + " instead."); } GroupType rootWrapperType = parquetType.asGroupType(); @@ -435,9 +446,9 @@ final class MapConverter extends GroupConverter { private final Converter converter; public MapConverter(Message.Builder parentBuilder, Descriptors.FieldDescriptor fieldDescriptor, Type parquetType) { - OriginalType originalType = parquetType.getOriginalType(); - if (originalType != OriginalType.MAP) { - throw new ParquetDecodingException("Expected MAP wrapper. Found: " + originalType + " instead."); + LogicalTypeAnnotation logicalTypeAnnotation = parquetType.getLogicalTypeAnnotation(); + if (!(logicalTypeAnnotation instanceof LogicalTypeAnnotation.MapLogicalTypeAnnotation)) { + throw new ParquetDecodingException("Expected MAP wrapper. Found: " + logicalTypeAnnotation + " instead."); } Type parquetSchema; diff --git a/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoSchemaConverter.java b/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoSchemaConverter.java index 0e1aa20100..db5be1409f 100644 --- a/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoSchemaConverter.java +++ b/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoSchemaConverter.java @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -23,8 +23,8 @@ import com.google.protobuf.Descriptors.FieldDescriptor.JavaType; import com.google.protobuf.Message; import com.twitter.elephantbird.util.Protobufs; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type; import org.apache.parquet.schema.Types; @@ -35,8 +35,10 @@ import java.util.List; -import static org.apache.parquet.schema.OriginalType.ENUM; -import static org.apache.parquet.schema.OriginalType.UTF8; +import static org.apache.parquet.schema.LogicalTypeAnnotation.enumType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.listType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.mapType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*; /** @@ -101,20 +103,19 @@ private Builder>, GroupBuilder> addF ParquetType parquetType = getParquetType(descriptor); if (descriptor.isRepeated() && parquetSpecsCompliant) { // the old schema style did not include the LIST wrapper around repeated fields - return addRepeatedPrimitive(descriptor, parquetType.primitiveType, parquetType.originalType, builder); + return addRepeatedPrimitive(parquetType.primitiveType, parquetType.logicalTypeAnnotation, builder); } - return builder.primitive(parquetType.primitiveType, getRepetition(descriptor)).as(parquetType.originalType); + return builder.primitive(parquetType.primitiveType, getRepetition(descriptor)).as(parquetType.logicalTypeAnnotation); } - private Builder>, 
GroupBuilder> addRepeatedPrimitive(FieldDescriptor descriptor, - PrimitiveTypeName primitiveType, - OriginalType originalType, + private Builder>, GroupBuilder> addRepeatedPrimitive(PrimitiveTypeName primitiveType, + LogicalTypeAnnotation logicalTypeAnnotation, final GroupBuilder builder) { return builder - .group(Type.Repetition.OPTIONAL).as(OriginalType.LIST) + .group(Type.Repetition.OPTIONAL).as(listType()) .group(Type.Repetition.REPEATED) - .primitive(primitiveType, Type.Repetition.REQUIRED).as(originalType) + .primitive(primitiveType, Type.Repetition.REQUIRED).as(logicalTypeAnnotation) .named("element") .named("list"); } @@ -122,7 +123,7 @@ private Builder>, GroupBuilder> addR private GroupBuilder> addRepeatedMessage(FieldDescriptor descriptor, GroupBuilder builder) { GroupBuilder>>> result = builder - .group(Type.Repetition.OPTIONAL).as(OriginalType.LIST) + .group(Type.Repetition.OPTIONAL).as(listType()) .group(Type.Repetition.REPEATED) .group(Type.Repetition.OPTIONAL); @@ -156,9 +157,9 @@ private GroupBuilder> addMapField(FieldDescriptor descriptor ParquetType mapKeyParquetType = getParquetType(fields.get(0)); GroupBuilder>> group = builder - .group(Type.Repetition.OPTIONAL).as(OriginalType.MAP) // only optional maps are allowed in Proto3 + .group(Type.Repetition.OPTIONAL).as(mapType()) // only optional maps are allowed in Proto3 .group(Type.Repetition.REPEATED) // key_value wrapper - .primitive(mapKeyParquetType.primitiveType, Type.Repetition.REQUIRED).as(mapKeyParquetType.originalType).named("key"); + .primitive(mapKeyParquetType.primitiveType, Type.Repetition.REQUIRED).as(mapKeyParquetType.logicalTypeAnnotation).named("key"); return addField(fields.get(1), group).named("value") .named("key_value"); @@ -173,8 +174,8 @@ private ParquetType getParquetType(FieldDescriptor fieldDescriptor) { case DOUBLE: return ParquetType.of(DOUBLE); case BOOLEAN: return ParquetType.of(BOOLEAN); case FLOAT: return ParquetType.of(FLOAT); - case STRING: return 
ParquetType.of(BINARY, UTF8); - case ENUM: return ParquetType.of(BINARY, ENUM); + case STRING: return ParquetType.of(BINARY, stringType()); + case ENUM: return ParquetType.of(BINARY, enumType()); case BYTE_STRING: return ParquetType.of(BINARY); default: throw new UnsupportedOperationException("Cannot convert Protocol Buffer: unknown type " + javaType); @@ -183,15 +184,15 @@ private ParquetType getParquetType(FieldDescriptor fieldDescriptor) { private static class ParquetType { PrimitiveTypeName primitiveType; - OriginalType originalType; + LogicalTypeAnnotation logicalTypeAnnotation; - private ParquetType(PrimitiveTypeName primitiveType, OriginalType originalType) { + private ParquetType(PrimitiveTypeName primitiveType, LogicalTypeAnnotation logicalTypeAnnotation) { this.primitiveType = primitiveType; - this.originalType = originalType; + this.logicalTypeAnnotation = logicalTypeAnnotation; } - public static ParquetType of(PrimitiveTypeName primitiveType, OriginalType originalType) { - return new ParquetType(primitiveType, originalType); + public static ParquetType of(PrimitiveTypeName primitiveType, LogicalTypeAnnotation logicalTypeAnnotation) { + return new ParquetType(primitiveType, logicalTypeAnnotation); } public static ParquetType of(PrimitiveTypeName primitiveType) { diff --git a/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoWriteSupport.java b/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoWriteSupport.java index 59c236f312..7436b04c6e 100644 --- a/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoWriteSupport.java +++ b/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoWriteSupport.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -38,6 +38,9 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Optional; + +import static java.util.Optional.ofNullable; /** * Implementation of {@link WriteSupport} for writing Protocol Buffers. @@ -216,15 +219,21 @@ private FieldWriter createMessageWriter(FieldDescriptor fieldDescriptor, Type ty } private GroupType getGroupType(Type type) { - if (type.getOriginalType() == OriginalType.LIST) { - return type.asGroupType().getType("list").asGroupType().getType("element").asGroupType(); - } - - if (type.getOriginalType() == OriginalType.MAP) { - return type.asGroupType().getType("key_value").asGroupType().getType("value").asGroupType(); + LogicalTypeAnnotation logicalTypeAnnotation = type.getLogicalTypeAnnotation(); + if (logicalTypeAnnotation == null) { + return type.asGroupType(); } + return logicalTypeAnnotation.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public Optional visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) { + return ofNullable(type.asGroupType().getType("list").asGroupType().getType("element").asGroupType()); + } - return type.asGroupType(); + @Override + public Optional visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) { + return ofNullable(type.asGroupType().getType("key_value").asGroupType().getType("value").asGroupType()); + } + }).orElse(type.asGroupType()); } private MapWriter createMapWriter(FieldDescriptor fieldDescriptor, Type 
type) { diff --git a/parquet-thrift/pom.xml b/parquet-thrift/pom.xml index 51a6b9b17f..4340430b0a 100644 --- a/parquet-thrift/pom.xml +++ b/parquet-thrift/pom.xml @@ -144,6 +144,17 @@ + + + + + org.apache.thrift + libthrift + ${thrift.version} + + + + diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java index 1185382e01..7bfcdb1adf 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -23,8 +23,8 @@ import org.apache.parquet.ShouldNeverHappenException; import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type; @@ -55,8 +55,8 @@ import static org.apache.parquet.Preconditions.checkNotNull; import static org.apache.parquet.schema.ConversionPatterns.listType; import static org.apache.parquet.schema.ConversionPatterns.mapType; -import static org.apache.parquet.schema.OriginalType.ENUM; -import static org.apache.parquet.schema.OriginalType.UTF8; +import static org.apache.parquet.schema.LogicalTypeAnnotation.enumType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; @@ -278,7 +278,7 @@ private ConvertedField visitPrimitiveType(PrimitiveTypeName type, State state) { return visitPrimitiveType(type, null, state); } - private ConvertedField visitPrimitiveType(PrimitiveTypeName type, OriginalType orig, State state) { + private ConvertedField visitPrimitiveType(PrimitiveTypeName type, LogicalTypeAnnotation orig, State state) { PrimitiveBuilder b = primitive(type, state.repetition); if (orig != null) { @@ -294,7 +294,7 @@ private ConvertedField visitPrimitiveType(PrimitiveTypeName type, OriginalType o @Override public ConvertedField 
visit(EnumType enumType, State state) { - return visitPrimitiveType(BINARY, ENUM, state); + return visitPrimitiveType(BINARY, enumType(), state); } @Override @@ -329,7 +329,7 @@ public ConvertedField visit(I64Type i64Type, State state) { @Override public ConvertedField visit(StringType stringType, State state) { - return stringType.isBinary() ? visitPrimitiveType(BINARY, state) : visitPrimitiveType(BINARY, UTF8, state); + return stringType.isBinary() ? visitPrimitiveType(BINARY, state) : visitPrimitiveType(BINARY, stringType(), state); } private static boolean isUnion(StructOrUnionType s) { diff --git a/parquet-tools/pom.xml b/parquet-tools/pom.xml index 566f8f1c31..32ee4d8eda 100644 --- a/parquet-tools/pom.xml +++ b/parquet-tools/pom.xml @@ -48,8 +48,8 @@ org.apache.parquet - parquet-format - ${parquet.format.version} + parquet-format-structures + ${project.version} org.apache.parquet diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/ColumnIndexCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/ColumnIndexCommand.java new file mode 100644 index 0000000000..cbbd8a1faa --- /dev/null +++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/ColumnIndexCommand.java @@ -0,0 +1,182 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.tools.command; + +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.tools.Main; + +/** + * parquet-tools command to print column and offset indexes. 
+ */
+public class ColumnIndexCommand extends ArgsOnlyCommand {
+  public static final String[] USAGE = new String[] {
+    "<input>",
+    "where <input> is the parquet file to print the column and offset indexes for"
+  };
+
+  public static final Options OPTIONS;
+  static {
+    OPTIONS = new Options();
+    OPTIONS.addOption(Option.builder("c")
+        .longOpt("column")
+        .desc("Shows the column/offset indexes for the given column only; "
+            + "multiple columns shall be separated by commas")
+        .hasArg()
+        .build());
+    OPTIONS.addOption(Option.builder("r")
+        .longOpt("row-group")
+        .desc("Shows the column/offset indexes for the given row-groups only; "
+            + "multiple row-groups shall be separated by commas; "
+            + "row-groups are referenced by their indexes from 0")
+        .hasArg()
+        .build());
+    OPTIONS.addOption(Option.builder("i")
+        .longOpt("column-index")
+        .desc("Shows the column indexes; "
+            + "active by default unless -o is used")
+        .hasArg(false)
+        .build());
+    OPTIONS.addOption(Option.builder("o")
+        .longOpt("offset-index")
+        .desc("Shows the offset indexes; "
+            + "active by default unless -i is used")
+        .hasArg(false)
+        .build());
+  }
+
+  public ColumnIndexCommand() {
+    super(1, 1);
+  }
+
+  @Override
+  public String[] getUsageDescription() {
+    return USAGE;
+  }
+
+  @Override
+  public String getCommandDescription() {
+    return "Prints the column and offset indexes of a Parquet file.";
+  }
+
+  @Override
+  public Options getOptions() {
+    return OPTIONS;
+  }
+
+  @Override
+  public void execute(CommandLine options) throws Exception {
+    super.execute(options);
+
+    String[] args = options.getArgs();
+    InputFile in = HadoopInputFile.fromPath(new Path(args[0]), new Configuration());
+    PrintWriter out = new PrintWriter(Main.out, true);
+    String rowGroupValue = options.getOptionValue("r");
+    Set<String> indexes = new HashSet<>(); // row-group indexes selected with -r, kept as strings; empty selects every row group
+    if (rowGroupValue != null) {
+      indexes.addAll(Arrays.asList(rowGroupValue.split("\\s*,\\s*")));
+    }
+    boolean showColumnIndex = options.hasOption("i");
+    boolean showOffsetIndex = options.hasOption("o");
+    if (!showColumnIndex && !showOffsetIndex) { // neither -i nor -o given: print both kinds of index
+      showColumnIndex = true;
+      showOffsetIndex = true;
+    }
+
+    try (ParquetFileReader reader = ParquetFileReader.open(in)) {
+      boolean firstBlock = true;
+      int rowGroupIndex = 0;
+      for (BlockMetaData block : reader.getFooter().getBlocks()) {
+        if (!indexes.isEmpty() && !indexes.contains(Integer.toString(rowGroupIndex))) {
+          ++rowGroupIndex; // skipped row groups still advance the index
+          continue;
+        }
+        if (!firstBlock) { // blank separator line before every printed row group except the first
+          out.println();
+        }
+        firstBlock = false; // cleared outside the guard above; inside it the flag could never change
+        out.format("row group %d:%n", rowGroupIndex);
+        for (ColumnChunkMetaData column : getColumns(block, options)) {
+          String path = column.getPath().toDotString();
+          if (showColumnIndex) {
+            out.format("column index for column %s:%n", path);
+            ColumnIndex columnIndex = reader.readColumnIndex(column);
+            if (columnIndex == null) {
+              out.println("NONE");
+            } else {
+              out.println(columnIndex);
+            }
+          }
+          if (showOffsetIndex) {
+            out.format("offset index for column %s:%n", path);
+            OffsetIndex offsetIndex = reader.readOffsetIndex(column);
+            if (offsetIndex == null) {
+              out.println("NONE");
+            } else {
+              out.println(offsetIndex);
+            }
+          }
+        }
+        ++rowGroupIndex;
+      }
+    }
+  }
+
+  private static List<ColumnChunkMetaData> getColumns(BlockMetaData block, CommandLine options) {
+    List<ColumnChunkMetaData> columns = block.getColumns();
+    String pathValue = options.getOptionValue("c");
+    if (pathValue == null) { // no -c option: keep every column chunk of the row group
+      return columns;
+    }
+    String[] paths = pathValue.split("\\s*,\\s*");
+    Map<String, ColumnChunkMetaData> pathMap = new HashMap<>(); // dot-string column path -> chunk metadata
+    for (ColumnChunkMetaData column : columns) {
+      pathMap.put(column.getPath().toDotString(), column);
+    }
+
+    List<ColumnChunkMetaData> filtered = new ArrayList<>(); // result follows the order of the -c argument
+    for (String path : paths) {
+      ColumnChunkMetaData column = pathMap.get(path);
+      if (column != null) { // requested paths that are absent from this row group are ignored
+        filtered.add(column);
+      }
+    }
+    return filtered;
+  }
+
+}
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/DumpCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/DumpCommand.java
index 26b5562ff5..27043b9480 100644
---
a/parquet-tools/src/main/java/org/apache/parquet/tools/command/DumpCommand.java +++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/DumpCommand.java @@ -58,7 +58,6 @@ import org.apache.parquet.io.api.PrimitiveConverter; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.PrimitiveStringifier; -import org.apache.parquet.tools.util.MetadataUtils; import org.apache.parquet.tools.util.PrettyPrintWriter; import org.apache.parquet.tools.util.PrettyPrintWriter.WhiteSpaceHandler; diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/MergeCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/MergeCommand.java index fe64587560..6d5b31380f 100644 --- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/MergeCommand.java +++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/MergeCommand.java @@ -19,20 +19,29 @@ package org.apache.parquet.tools.command; import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.parquet.hadoop.CodecFactory; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.util.HadoopInputFile; import org.apache.parquet.hadoop.util.HiddenFileFilter; import org.apache.parquet.hadoop.ParquetFileWriter; import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.InputFile; import org.apache.parquet.tools.Main; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; + +import static org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE; +import static org.apache.parquet.hadoop.ParquetWriter.DEFAULT_PAGE_SIZE; public class MergeCommand extends ArgsOnlyCommand { 
public static final String[] USAGE = new String[] { @@ -49,12 +58,43 @@ public class MergeCommand extends ArgsOnlyCommand { private Configuration conf; + private static final Options OPTIONS; + static { + OPTIONS = new Options(); + + Option block = Option.builder("b") + .longOpt("block") + .desc("Merge adjacent blocks into one up to upper bound size limit default to 128 MB") + .build(); + + Option limit = Option.builder("l") + .longOpt("limit") + .desc("Upper bound for merged block size in megabytes. Default: 128 MB") + .hasArg() + .build(); + + Option codec = Option.builder("c") + .longOpt("codec") + .desc("Compression codec name. Default: SNAPPY. Valid values: UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI, LZ4, ZSTD") + .hasArg() + .build(); + + OPTIONS.addOption(limit); + OPTIONS.addOption(block); + OPTIONS.addOption(codec); + } + public MergeCommand() { super(2, MAX_FILE_NUM + 1); conf = new Configuration(); } + @Override + public Options getOptions() { + return OPTIONS; + } + @Override public String[] getUsageDescription() { return USAGE; @@ -63,18 +103,32 @@ public String[] getUsageDescription() { @Override public String getCommandDescription() { return "Merges multiple Parquet files into one. " + - "The command doesn't merge row groups, just places one after the other. " + + "Without -b option the command doesn't merge row groups, just places one after the other. " + "When used to merge many small files, the resulting file will still contain small row groups, " + - "which usually leads to bad query performance."; + "which usually leads to bad query performance. " + + "To have adjacent small blocks merged together use -b option. " + + "Blocks will be grouped into larger one until the upper bound is reached. 
" + + "Default block upper bound 128 MB and default compression SNAPPY can be customized using -l and -c options"; } @Override public void execute(CommandLine options) throws Exception { + boolean mergeBlocks = options.hasOption('b'); + int maxBlockSize = options.hasOption('l')? Integer.parseInt(options.getOptionValue('l')) * 1024 * 1024 : DEFAULT_BLOCK_SIZE; + CompressionCodecName compressionCodec = options.hasOption('c') ? CompressionCodecName.valueOf(options.getOptionValue('c')) : CompressionCodecName.SNAPPY; // Prepare arguments List args = options.getArgList(); List inputFiles = getInputFiles(args.subList(0, args.size() - 1)); Path outputFile = new Path(args.get(args.size() - 1)); + if (mergeBlocks) { + CodecFactory.BytesCompressor compressor = new CodecFactory(conf, DEFAULT_PAGE_SIZE).getCompressor(compressionCodec); + mergeBlocks(maxBlockSize, compressor, inputFiles, outputFile); + } else { + mergeFiles(inputFiles, outputFile); + } + } + private void mergeFiles(List inputFiles, Path outputFile) throws IOException { // Merge schema and extraMeta FileMetaData mergedMeta = mergedMetadata(inputFiles); PrintWriter out = new PrintWriter(Main.out, true); @@ -103,6 +157,23 @@ public void execute(CommandLine options) throws Exception { writer.end(mergedMeta.getKeyValueMetaData()); } + private void mergeBlocks(int maxBlockSize, CodecFactory.BytesCompressor compressor, List inputFiles, Path outputFile) throws IOException { + // Merge schema and extraMeta + FileMetaData mergedMeta = mergedMetadata(inputFiles); + + // Merge data + ParquetFileWriter writer = new ParquetFileWriter(conf, mergedMeta.getSchema(), outputFile, ParquetFileWriter.Mode.CREATE); + List inputFileList = inputFiles.stream() + .map(input -> { + try { + return HadoopInputFile.fromPath(input, conf); + } catch (Exception e) { + throw new RuntimeException(e); + } + }).collect(Collectors.toList()); + writer.merge(inputFileList, compressor, mergedMeta.getCreatedBy(), maxBlockSize); + } + private FileMetaData 
mergedMetadata(List inputFiles) throws IOException { return ParquetFileWriter.mergeMetadataFiles(inputFiles, conf).getFileMetaData(); } diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/MetadataUtils.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/MetadataUtils.java new file mode 100644 index 0000000000..0bade37002 --- /dev/null +++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/MetadataUtils.java @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.tools.command; + +import com.google.common.base.Joiner; +import com.google.common.base.Strings; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Type.Repetition; +import org.apache.parquet.tools.util.PrettyPrintWriter; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +class MetadataUtils { + static void showDetails(PrettyPrintWriter out, ParquetMetadata meta, boolean showOriginalTypes) { + showDetails(out, meta.getFileMetaData(), showOriginalTypes); + + long i = 1; + for (BlockMetaData bmeta : meta.getBlocks()) { + out.println(); + showDetails(out, bmeta, i++); + } + } + + static void showDetails(PrettyPrintWriter out, FileMetaData meta, boolean showOriginalTypes) { + out.format("creator: %s%n", meta.getCreatedBy()); + + Map extra = meta.getKeyValueMetaData(); + if (extra != null) { + for (Map.Entry entry : meta.getKeyValueMetaData().entrySet()) { + out.print("extra: "); + out.incrementTabLevel(); + out.format("%s = %s%n", entry.getKey(), entry.getValue()); + out.decrementTabLevel(); + } + } + + out.println(); + out.format("file schema: %s%n", meta.getSchema().getName()); + out.rule('-'); + showDetails(out, meta.getSchema(), showOriginalTypes); + } + + private static void 
showDetails(PrettyPrintWriter out, BlockMetaData meta, Long num) { + long rows = meta.getRowCount(); + long tbs = meta.getTotalByteSize(); + long offset = meta.getStartingPos(); + + out.format("row group%s: RC:%d TS:%d OFFSET:%d%n", (num == null ? "" : " " + num), rows, tbs, offset); + out.rule('-'); + showDetails(out, meta.getColumns()); + } + + static void showDetails(PrettyPrintWriter out, List ccmeta) { + Map chunks = new LinkedHashMap(); + for (ColumnChunkMetaData cmeta : ccmeta) { + String[] path = cmeta.getPath().toArray(); + + Map current = chunks; + for (int i = 0; i < path.length - 1; ++i) { + String next = path[i]; + if (!current.containsKey(next)) { + current.put(next, new LinkedHashMap()); + } + + current = (Map)current.get(next); + } + + current.put(path[path.length - 1], cmeta); + } + + showColumnChunkDetails(out, chunks, 0); + } + + private static void showColumnChunkDetails(PrettyPrintWriter out, Map current, int depth) { + for (Map.Entry entry : current.entrySet()) { + String name = Strings.repeat(".", depth) + entry.getKey(); + Object value = entry.getValue(); + + if (value instanceof Map) { + out.println(name + ": "); + showColumnChunkDetails(out, (Map)value, depth + 1); + } else { + out.print(name + ": "); + showDetails(out, (ColumnChunkMetaData)value, false); + } + } + } + + private static void showDetails(PrettyPrintWriter out, ColumnChunkMetaData meta, boolean name) { + long doff = meta.getDictionaryPageOffset(); + long foff = meta.getFirstDataPageOffset(); + long tsize = meta.getTotalSize(); + long usize = meta.getTotalUncompressedSize(); + long count = meta.getValueCount(); + double ratio = usize / (double)tsize; + String encodings = Joiner.on(',').skipNulls().join(meta.getEncodings()); + + if (name) { + String path = Joiner.on('.').skipNulls().join(meta.getPath()); + out.format("%s: ", path); + } + + out.format(" %s", meta.getType()); + out.format(" %s", meta.getCodec()); + out.format(" DO:%d", doff); + out.format(" FPO:%d", foff); + 
out.format(" SZ:%d/%d/%.2f", tsize, usize, ratio); + out.format(" VC:%d", count); + if (!encodings.isEmpty()) out.format(" ENC:%s", encodings); + Statistics stats = meta.getStatistics(); + if (stats != null) { + out.format(" ST:[%s]", stats); + } else { + out.format(" ST:[none]"); + } + out.println(); + } + + static void showDetails(PrettyPrintWriter out, MessageType type, boolean showOriginalTypes) { + List cpath = new ArrayList(); + for (Type ftype : type.getFields()) { + showDetails(out, ftype, 0, type, cpath, showOriginalTypes); + } + } + + private static void showDetails(PrettyPrintWriter out, GroupType type, int depth, MessageType container, List cpath, boolean showOriginalTypes) { + String name = Strings.repeat(".", depth) + type.getName(); + Repetition rep = type.getRepetition(); + int fcount = type.getFieldCount(); + out.format("%s: %s F:%d%n", name, rep, fcount); + + cpath.add(type.getName()); + for (Type ftype : type.getFields()) { + showDetails(out, ftype, depth + 1, container, cpath, showOriginalTypes); + } + cpath.remove(cpath.size() - 1); + } + + private static void showDetails(PrettyPrintWriter out, PrimitiveType type, int depth, MessageType container, List cpath, boolean showOriginalTypes) { + String name = Strings.repeat(".", depth) + type.getName(); + Repetition rep = type.getRepetition(); + PrimitiveTypeName ptype = type.getPrimitiveTypeName(); + + out.format("%s: %s %s", name, rep, ptype); + if (showOriginalTypes) { + OriginalType otype; + try { + otype = type.getOriginalType(); + } catch (Exception e) { + otype = null; + } + if (otype != null) out.format(" O:%s", otype); + } else { + LogicalTypeAnnotation ltype = type.getLogicalTypeAnnotation(); + if (ltype != null) out.format(" L:%s", ltype); + } + + if (container != null) { + cpath.add(type.getName()); + String[] paths = cpath.toArray(new String[cpath.size()]); + cpath.remove(cpath.size() - 1); + + ColumnDescriptor desc = container.getColumnDescription(paths); + + int defl = 
desc.getMaxDefinitionLevel(); + int repl = desc.getMaxRepetitionLevel(); + out.format(" R:%d D:%d", repl, defl); + } + out.println(); + } + + private static void showDetails(PrettyPrintWriter out, Type type, int depth, MessageType container, List cpath, boolean showOriginalTypes) { + if (type instanceof GroupType) { + showDetails(out, type.asGroupType(), depth, container, cpath, showOriginalTypes); + return; + } else if (type instanceof PrimitiveType) { + showDetails(out, type.asPrimitiveType(), depth, container, cpath, showOriginalTypes); + return; + } + } +} diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java index 6df84be37a..399efb7316 100644 --- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java +++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java @@ -34,6 +34,7 @@ public final class Registry { registry.put("merge", MergeCommand.class); registry.put("rowcount", RowCountCommand.class); registry.put("size", SizeCommand.class); + registry.put("column-index", ColumnIndexCommand.class); } public static Map allCommands() { diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/ShowMetaCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/ShowMetaCommand.java index 8d35551525..b07fa7a693 100644 --- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/ShowMetaCommand.java +++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/ShowMetaCommand.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -19,13 +19,15 @@ package org.apache.parquet.tools.command; import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.OptionBuilder; +import org.apache.commons.cli.Options; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.apache.parquet.hadoop.Footer; import org.apache.parquet.hadoop.ParquetFileReader; -import org.apache.parquet.tools.util.MetadataUtils; import org.apache.parquet.tools.util.PrettyPrintWriter; import org.apache.parquet.tools.util.PrettyPrintWriter.WhiteSpaceHandler; @@ -37,6 +39,15 @@ public class ShowMetaCommand extends ArgsOnlyCommand { "where is the parquet file to print to stdout" }; + public static final Options OPTIONS; + static { + OPTIONS = new Options(); + Option originalType = OptionBuilder.withLongOpt("originalType") + .withDescription("Print logical types in OriginalType representation.") + .create('o'); + OPTIONS.addOption(originalType); + } + public ShowMetaCommand() { super(1, 1); } @@ -51,13 +62,19 @@ public String getCommandDescription() { return "Prints the metadata of Parquet file(s)"; } + @Override + public Options getOptions() { + return OPTIONS; + } + @Override public void execute(CommandLine options) throws Exception { super.execute(options); String[] args = options.getArgs(); String input = args[0]; - + boolean showOriginalTypes = options.hasOption('o'); + Configuration conf = new Configuration(); Path inputPath = new Path(input); FileStatus inputFileStatus = inputPath.getFileSystem(conf).getFileStatus(inputPath); @@ -71,7 +88,7 @@ public void execute(CommandLine options) throws Exception { for(Footer f: footers) { 
out.format("file: %s%n" , f.getFile()); - MetadataUtils.showDetails(out, f.getParquetMetadata()); + MetadataUtils.showDetails(out, f.getParquetMetadata(), showOriginalTypes); out.flushColumns(); } } diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/ShowSchemaCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/ShowSchemaCommand.java index d83e5649e4..6f83857b3b 100644 --- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/ShowSchemaCommand.java +++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/ShowSchemaCommand.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -32,7 +32,6 @@ import org.apache.parquet.hadoop.util.HiddenFileFilter; import org.apache.parquet.schema.MessageType; import org.apache.parquet.tools.Main; -import org.apache.parquet.tools.util.MetadataUtils; import org.apache.parquet.tools.util.PrettyPrintWriter; import static org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER; @@ -49,7 +48,11 @@ public class ShowSchemaCommand extends ArgsOnlyCommand { Option help = OptionBuilder.withLongOpt("detailed") .withDescription("Show detailed information about the schema.") .create('d'); + Option originalType = OptionBuilder.withLongOpt("originalType") + .withDescription("Print logical types in OriginalType representation.") + .create('o'); OPTIONS.addOption(help); + 
OPTIONS.addOption(originalType); } public ShowSchemaCommand() { @@ -98,8 +101,9 @@ public void execute(CommandLine options) throws Exception { Main.out.println(schema); if (options.hasOption('d')) { + boolean showOriginalTypes = options.hasOption('o'); PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter().build(); - MetadataUtils.showDetails(out, metaData); + MetadataUtils.showDetails(out, metaData, showOriginalTypes); } } } diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/read/SimpleRecordConverter.java b/parquet-tools/src/main/java/org/apache/parquet/tools/read/SimpleRecordConverter.java index a119a347e7..7a1c81d6f8 100644 --- a/parquet-tools/src/main/java/org/apache/parquet/tools/read/SimpleRecordConverter.java +++ b/parquet-tools/src/main/java/org/apache/parquet/tools/read/SimpleRecordConverter.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -20,15 +20,18 @@ import java.math.BigDecimal; import java.math.BigInteger; +import java.util.Optional; import org.apache.parquet.io.api.Binary; import org.apache.parquet.io.api.Converter; import org.apache.parquet.io.api.GroupConverter; import org.apache.parquet.io.api.PrimitiveConverter; import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.Type; +import static java.util.Optional.of; + public class SimpleRecordConverter extends GroupConverter { private final Converter converters[]; private final String name; @@ -51,31 +54,38 @@ public SimpleRecordConverter(GroupType schema, String name, SimpleRecordConverte } private Converter createConverter(Type field) { - OriginalType otype = field.getOriginalType(); + LogicalTypeAnnotation ltype = field.getLogicalTypeAnnotation(); if (field.isPrimitive()) { - if (otype != null) { - switch (otype) { - case MAP: break; - case LIST: break; - case UTF8: return new StringConverter(field.getName()); - case MAP_KEY_VALUE: break; - case ENUM: break; - case DECIMAL: - int scale = field.asPrimitiveType().getDecimalMetadata().getScale(); - return new DecimalConverter(field.getName(), scale); - } + if (ltype != null) { + return ltype.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) { + return of(new StringConverter(field.getName())); + } + + @Override + public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { + int scale = decimalLogicalType.getScale(); + return of(new 
DecimalConverter(field.getName(), scale)); + } + }).orElse(new SimplePrimitiveConverter(field.getName())); } - - return new SimplePrimitiveConverter(field.getName()); } GroupType groupType = field.asGroupType(); - if (otype != null) { - switch (otype) { - case MAP: return new SimpleMapRecordConverter(groupType, field.getName(), this); - case LIST: return new SimpleListRecordConverter(groupType, field.getName(), this); - } + if (ltype != null) { + return ltype.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public Optional visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) { + return of(new SimpleMapRecordConverter(groupType, field.getName(), SimpleRecordConverter.this)); + } + + @Override + public Optional visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) { + return of(new SimpleListRecordConverter(groupType, field.getName(), SimpleRecordConverter.this)); + } + }).orElse(new SimpleRecordConverter(groupType, field.getName(), this)); } return new SimpleRecordConverter(groupType, field.getName(), this); } @@ -162,6 +172,16 @@ public DecimalConverter(String name, int scale) { public void addBinary(Binary value) { record.add(name, new BigDecimal(new BigInteger(value.getBytes()), scale)); } + + @Override + public void addInt(int value) { + record.add(name, BigDecimal.valueOf(value).movePointLeft(scale)); + } + + @Override + public void addLong(long value) { + record.add(name, BigDecimal.valueOf(value).movePointLeft(scale)); + } } } diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/util/MetadataUtils.java b/parquet-tools/src/main/java/org/apache/parquet/tools/util/MetadataUtils.java index 870b8c18a0..206028a303 100644 --- a/parquet-tools/src/main/java/org/apache/parquet/tools/util/MetadataUtils.java +++ b/parquet-tools/src/main/java/org/apache/parquet/tools/util/MetadataUtils.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor 
license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -40,6 +40,7 @@ import org.apache.parquet.schema.Type; import org.apache.parquet.schema.Type.Repetition; +@Deprecated public class MetadataUtils { public static final double BAD_COMPRESSION_RATIO_CUTOFF = 0.97; public static final double GOOD_COMPRESSION_RATIO_CUTOFF = 1.2; @@ -163,7 +164,7 @@ public static void showDetails(PrettyPrintWriter out, ColumnDescriptor desc) { int defl = desc.getMaxDefinitionLevel(); int repl = desc.getMaxRepetitionLevel(); - out.format("column desc: %s T:%s R:%d D:%d%n", path, type, repl, defl); + out.format("column desc: %s T:%s R:%d D:%d%n", path, type, repl, defl); } public static void showDetails(PrettyPrintWriter out, MessageType type) { diff --git a/pom.xml b/pom.xml index 6e6902847b..a6ba552caa 100644 --- a/pom.xml +++ b/pom.xml @@ -84,6 +84,7 @@ 2.7.0-SNAPSHOT 1.7.0 thrift + thrift 2.10.6 2.10 @@ -92,6 +93,7 @@ h2 0.10.0 0.9.3 + 0.9.3 7.0.13 0.9.33 1.7.22 @@ -117,6 +119,7 @@ parquet-column parquet-common parquet-encoding + parquet-format-structures parquet-generator parquet-hadoop parquet-jackson @@ -175,6 +178,11 @@ + + + **/generated-sources/**/*.java + + org.codehaus.mojo @@ -213,13 +221,13 @@ - + org.apache.maven.plugins maven-resources-plugin 2.7 - + maven-enforcer-plugin 1.3.1 @@ -373,7 +381,7 @@ - + org.apache.maven.plugins maven-resources-plugin @@ -388,8 +396,8 @@ true - - + + org.apache.maven.plugins From b8a0f5c2b6008b7bfeef3b78dd9b4a3a73eb9913 Mon Sep 17 00:00:00 2001 From: Junjie Chen Date: Sun, 21 
Oct 2018 22:52:44 +0800 Subject: [PATCH 4/9] Fix conflicts after rebase and merge --- .../parquet/column/impl/ColumnWriterV2.java | 2 + .../TestBlockSplitBloomFilter.java | 2 +- .../parquet/hadoop/ParquetFileWriter.java | 40 ------------------- 3 files changed, 3 insertions(+), 41 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java index f60c9b2da9..8e9e6f7fa2 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java @@ -60,6 +60,8 @@ public BytesInput getBytes() { super(path, pageWriter, props); } + private static final ValuesWriter NULL_WRITER = new DevNullValuesWriter(); + ColumnWriterV2(ColumnDescriptor path, PageWriter pageWriter, BloomFilterWriter bloomFilterWriter, ParquetProperties props) { super(path, pageWriter, bloomFilterWriter, props); diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java b/parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java index 0f85195706..a76109f127 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java @@ -112,7 +112,7 @@ public void testFPP() throws IOException { exist ++; } } - + // The exist should be probably less than 1000 according FPP 0.01. 
assertTrue(exist < totalCount * FPP); } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java index 6556c6ada0..0a13e543e4 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java @@ -490,46 +490,6 @@ public void writeDataPage( innerWriteDataPage(valueCount, uncompressedPageSize, bytes, statistics, rlEncoding, dlEncoding, valuesEncoding); } - /** - * Writes a single page - * @param valueCount count of values - * @param uncompressedPageSize the size of the data once uncompressed - * @param bytes the compressed data for the page without header - * @param statistics the statistics of the page - * @param rowCount the number of rows in the page - * @param rlEncoding encoding of the repetition level - * @param dlEncoding encoding of the definition level - * @param valuesEncoding encoding of values - * @throws IOException if any I/O error occurs during writing the file - */ - @Deprecated - public void writeDataPage( - int valueCount, int uncompressedPageSize, - BytesInput bytes, - Statistics statistics, - long rowCount, - Encoding rlEncoding, - Encoding dlEncoding, - Encoding valuesEncoding) throws IOException { - long beforeHeader = out.getPos(); - innerWriteDataPage(valueCount, uncompressedPageSize, bytes, statistics, rlEncoding, dlEncoding, valuesEncoding); - - offsetIndexBuilder.add((int) (out.getPos() - beforeHeader), rowCount); - } - - private void innerWriteDataPage( - int valueCount, int uncompressedPageSize, - BytesInput bytes, - Statistics statistics, - Encoding rlEncoding, - Encoding dlEncoding, - Encoding valuesEncoding) throws IOException { - // We are unable to build indexes without rowCount so skip them for this column - offsetIndexBuilder = OffsetIndexBuilder.getNoOpBuilder(); - columnIndexBuilder = 
ColumnIndexBuilder.getNoOpBuilder(); - innerWriteDataPage(valueCount, uncompressedPageSize, bytes, statistics, rlEncoding, dlEncoding, valuesEncoding); - } - /** * Writes a single page * @param valueCount count of values From 1b646a9261accadb6a5ab78307dccb3aface8aa2 Mon Sep 17 00:00:00 2001 From: Junjie Chen Date: Wed, 31 Oct 2018 01:30:54 +0800 Subject: [PATCH 5/9] address comments --- .../parquet/column/ParquetProperties.java | 2 + .../column/impl/ColumnWriteStoreBase.java | 1 + .../parquet/column/impl/ColumnWriterBase.java | 12 ++-- .../bloomfilter/BlockSplitBloomFilter.java | 70 +++++++++---------- .../values/bloomfilter/BloomFilter.java | 32 ++++----- .../TestBlockSplitBloomFilter.java | 14 ++-- .../parquet/hadoop/ParquetOutputFormat.java | 28 +++++++- .../parquet/hadoop/TestParquetFileWriter.java | 8 +-- 8 files changed, 94 insertions(+), 73 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java b/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java index 525af61021..1690b68e80 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java @@ -90,6 +90,8 @@ public static WriterVersion fromString(String name) { private final ValuesWriterFactory valuesWriterFactory; private final int columnIndexTruncateLength; private final boolean enableBloomFilter; + + // The key-value pair represents the column name and its expected distinct number of values in a row group. 
private final HashMap bloomFilterExpectedDistinctNumbers; private ParquetProperties(WriterVersion writerVersion, int pageSize, int dictPageSize, boolean enableDict, int minRowCountForPageSizeCheck, diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreBase.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreBase.java index dc4946e4ff..a0658640e8 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreBase.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreBase.java @@ -107,6 +107,7 @@ public ColumnWriter getColumnWriter(ColumnDescriptor path) { }; } + // The Bloom filter is written to a specified bitset instead of pages, so it needs a separate write store abstraction. ColumnWriteStoreBase( MessageType schema, PageWriteStore pageWriteStore, diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java index af8e90ecc2..84a25e3757 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java @@ -81,7 +81,7 @@ abstract class ColumnWriterBase implements ColumnWriter { ) { this(path, pageWriter, props); - // Current not support nested column. + // Bloom filters don't support nested columns yet; see PARQUET-1453.
if (path.getPath().length != 1 || bloomFilterWriter == null) { return; } @@ -154,31 +154,31 @@ public long getBufferedSizeInMemory() { private void updateBloomFilter(int value) { if (bloomFilter != null) { - bloomFilter.insert(bloomFilter.hash(value)); + bloomFilter.insertHash(bloomFilter.hash(value)); } } private void updateBloomFilter(long value) { if (bloomFilter != null) { - bloomFilter.insert(bloomFilter.hash(value)); + bloomFilter.insertHash(bloomFilter.hash(value)); } } private void updateBloomFilter(double value) { if (bloomFilter != null) { - bloomFilter.insert(bloomFilter.hash(value)); + bloomFilter.insertHash(bloomFilter.hash(value)); } } private void updateBloomFilter(float value) { if (bloomFilter != null) { - bloomFilter.insert(bloomFilter.hash(value)); + bloomFilter.insertHash(bloomFilter.hash(value)); } } private void updateBloomFilter(Binary value) { if (bloomFilter != null) { - bloomFilter.insert(bloomFilter.hash(value)); + bloomFilter.insertHash(bloomFilter.hash(value)); } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java index f5ceadc428..18d1876aaf 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java @@ -36,24 +36,25 @@ * each tiny Bloom filter. Each tiny Bloom filter is 32 bytes to take advantage of 32-byte SIMD * instruction. */ -public class BlockSplitBloomFilter extends BloomFilter { +public class BlockSplitBloomFilter implements BloomFilter { // Bytes in a tiny Bloom filter block. - private static final int BYTES_PER_FILTER_BLOCK = 32; + private static final int BYTES_PER_BLOCK = 32; - // Default seed for hash function, it comes from System.nanoTime(). + // Default seed for the hash function. 
It comes from System.nanoTime(). private static final int DEFAULT_SEED = 1361930890; - // Minimum Bloom filter size, set to size of a tiny Bloom filter block - public static final int MINIMUM_BLOOM_FILTER_BYTES = 32; + // Minimum Bloom filter size, set to the size of a tiny Bloom filter block + public static final int MINIMUM_BYTES = 32; - // Maximum Bloom filter size, it sets to default HDFS block size for upper boundary check + // Maximum Bloom filter size, set to the default HDFS block size for upper boundary check // This should be re-consider when implementing write side logic. - public static final int MAXIMUM_BLOOM_FILTER_BYTES = 128 * 1024 * 1024; + public static final int MAXIMUM_BYTES = 128 * 1024 * 1024; // The number of bits to set in a tiny Bloom filter private static final int BITS_SET_PER_BLOCK = 8; - // The header of Bloom filter, it includes number of bytes, algorithm and hash enumeration. + // The metadata in the header of a serialized Bloom filter is three four-byte values, in order: the number of bytes, + // the hash strategy, and the filter algorithm. public static final int HEADER_SIZE = 12; // The default false positive probability value @@ -62,9 +63,6 @@ public class BlockSplitBloomFilter extends BloomFilter { // Hash strategy used in this Bloom filter. public final HashStrategy hashStrategy; - // Algorithm used in this Bloom filter. - public final Algorithm algorithm; - // The underlying byte array for Bloom filter bitset. private byte[] bitset; @@ -74,18 +72,18 @@ public class BlockSplitBloomFilter extends BloomFilter { // Hash function use to compute hash for column value. private HashFunction hashFunction; - // The block-based algorithm needs 8 odd SALT values to calculate eight index - // of bit to set, one bit in 32-bit word. + // The block-based algorithm needs 8 odd SALT values to calculate eight indexes + // of bits to set, one per 32-bit word.
private static final int SALT[] = {0x47b6137b, 0x44974d91, 0x8824ad5b, 0xa2b7289d, 0x705495c7, 0x2df1424b, 0x9efc4947, 0x5c6bfb31}; + /** * Constructor of Bloom filter. * * @param numBytes The number of bytes for Bloom filter bitset. The range of num_bytes should be within - * [MINIMUM_BLOOM_FILTER_BYTES, MAXIMUM_BLOOM_FILTER_BYTES], it will be rounded up/down - * to lower/upper bound if num_bytes is out of range and also will rounded up to a power - * of 2. It uses murmur3_x64_128 as its default hash function and block-based algorithm - * as default algorithm. + * [MINIMUM_BYTES, MAXIMUM_BYTES], it will be rounded up/down + * to lower/upper bound if num_bytes is out of range. It will also be rounded up to a power + * of 2. It uses murmur3_x64_128 as its default hash function. */ public BlockSplitBloomFilter(int numBytes) { this(numBytes, HashStrategy.MURMUR3_X64_128, Algorithm.BLOCK); @@ -107,9 +105,8 @@ private BlockSplitBloomFilter(int numBytes, HashStrategy hashStrategy, Algorithm hashFunction = Hashing.murmur3_128(DEFAULT_SEED); break; default: - throw new RuntimeException("Not supported hash strategy"); + throw new RuntimeException("Unsupported hash strategy"); } - this.algorithm = algorithm; } /** @@ -146,28 +143,27 @@ private BlockSplitBloomFilter(byte[] bitset, HashStrategy hashStrategy, Algorith default: throw new RuntimeException("Not supported hash strategy"); } - this.algorithm = algorithm; } /** * Create a new bitset for Bloom filter. * * @param numBytes The number of bytes for Bloom filter bitset. The range of num_bytes should be within - * [MINIMUM_BLOOM_FILTER_BYTES, MAXIMUM_BLOOM_FILTER_BYTES], it will be rounded up/down + * [MINIMUM_BYTES, MAXIMUM_BYTES], it will be rounded up/down * to lower/upper bound if num_bytes is out of range and also will rounded up to a power * of 2. It uses murmur3_x64_128 as its default hash function and block-based algorithm * as default algorithm. 
*/ private void initBitset(int numBytes) { - if (numBytes < MINIMUM_BLOOM_FILTER_BYTES) { - numBytes = MINIMUM_BLOOM_FILTER_BYTES; + if (numBytes < MINIMUM_BYTES) { + numBytes = MINIMUM_BYTES; } // Get next power of 2 if it is not power of 2. if ((numBytes & (numBytes - 1)) != 0) { numBytes = Integer.highestOneBit(numBytes) << 1; } - if (numBytes > MAXIMUM_BLOOM_FILTER_BYTES || numBytes < 0) { - numBytes = MAXIMUM_BLOOM_FILTER_BYTES; + if (numBytes > MAXIMUM_BYTES || numBytes < 0) { + numBytes = MAXIMUM_BYTES; } this.bitset = new byte[numBytes]; this.intBuffer = ByteBuffer.wrap(bitset).order(ByteOrder.LITTLE_ENDIAN).asIntBuffer(); @@ -178,9 +174,9 @@ public void writeTo(OutputStream out) throws IOException { // Write number of bytes of bitset. out.write(BytesUtils.intToBytes(bitset.length)); // Write hash strategy - out.write(BytesUtils.intToBytes(this.hashStrategy.ordinal())); + out.write(BytesUtils.intToBytes(hashStrategy.ordinal())); // Write algorithm - out.write(BytesUtils.intToBytes(this.algorithm.ordinal())); + out.write(BytesUtils.intToBytes(Algorithm.BLOCK.ordinal())); // Write bitset out.write(bitset); } @@ -202,28 +198,28 @@ private int[] setMask(int key) { } @Override - public void insert(long hash) { - int bucketIndex = (int)(hash >> 32) & (bitset.length / BYTES_PER_FILTER_BLOCK - 1); + public void insertHash(long hash) { + int bucketIndex = (int)(hash >> 32) & (bitset.length / BYTES_PER_BLOCK - 1); int key = (int)hash; // Calculate mask for bucket. 
int mask[] = setMask(key); for (int i = 0; i < BITS_SET_PER_BLOCK; i++) { - int value = intBuffer.get(bucketIndex * (BYTES_PER_FILTER_BLOCK / 4) + i); + int value = intBuffer.get(bucketIndex * (BYTES_PER_BLOCK / 4) + i); value |= mask[i]; - intBuffer.put(bucketIndex * (BYTES_PER_FILTER_BLOCK / 4) + i, value); + intBuffer.put(bucketIndex * (BYTES_PER_BLOCK / 4) + i, value); } } @Override - public boolean find(long hash) { - int bucketIndex = (int)(hash >> 32) & (bitset.length / BYTES_PER_FILTER_BLOCK - 1); + public boolean findHash(long hash) { + int bucketIndex = (int)(hash >> 32) & (bitset.length / BYTES_PER_BLOCK - 1); int key = (int)hash; // Calculate mask for the tiny Bloom filter. int mask[] = setMask(key); for (int i = 0; i < BITS_SET_PER_BLOCK; i++) { - if (0 == (intBuffer.get(bucketIndex * (BYTES_PER_FILTER_BLOCK / 4) + i) & mask[i])) { + if (0 == (intBuffer.get(bucketIndex * (BYTES_PER_BLOCK / 4) + i) & mask[i])) { return false; } } @@ -242,7 +238,7 @@ public static int optimalNumOfBits(long n, double p) { Preconditions.checkArgument((p > 0.0 && p < 1.0), "FPP should be less than 1.0 and great than 0.0"); final double m = -8 * n / Math.log(1 - Math.pow(p, 1.0 / 8)); - final double MAX = MAXIMUM_BLOOM_FILTER_BYTES << 3; + final double MAX = MAXIMUM_BYTES << 3; int numBits = (int)m; // Handle overflow. 
@@ -253,8 +249,8 @@ public static int optimalNumOfBits(long n, double p) { if ((numBits & (numBits - 1)) != 0) { numBits = Integer.highestOneBit(numBits) << 1; } - if (numBits < (MINIMUM_BLOOM_FILTER_BYTES << 3)) { - numBits = MINIMUM_BLOOM_FILTER_BYTES << 3; + if (numBits < (MINIMUM_BYTES << 3)) { + numBits = MINIMUM_BYTES << 3; } return numBits; diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java index 4199497fd9..d02fa52398 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java @@ -27,24 +27,24 @@ * in a set. The Bloom filter usually consists of a bit set that represents a elements set, * a hash strategy and a Bloom filter algorithm. */ -public abstract class BloomFilter { +public interface BloomFilter { // Bloom filter Hash strategy. - public enum HashStrategy { - MURMUR3_X64_128, + enum HashStrategy { + MURMUR3_X64_128 } // Bloom filter algorithm. - public enum Algorithm { - BLOCK, + enum Algorithm { + BLOCK } /** - * Write the Bloom filter to an output stream. It writes the Bloom filter header includes the - * bitset's length in size of byte, the hash strategy, the algorithm, and the bitset. + * Write the Bloom filter to an output stream. It writes the Bloom filter header including the + * bitset's length in bytes, the hash strategy, the algorithm, and the bitset. * * @param out the output stream to write */ - public abstract void writeTo(OutputStream out) throws IOException; + void writeTo(OutputStream out) throws IOException; /** * Insert an element to the Bloom filter, the element content is represented by @@ -52,7 +52,7 @@ public enum Algorithm { * * @param hash the hash result of element. 
*/ - public abstract void insert(long hash); + void insertHash(long hash); /** * Determine whether an element is in set or not. @@ -60,7 +60,7 @@ public enum Algorithm { * @param hash the hash value of element plain encoding result. * @return false if element is must not in set, true if element probably in set. */ - public abstract boolean find(long hash); + boolean findHash(long hash); /** * Compute hash for int value by using its plain encoding result. @@ -68,7 +68,7 @@ public enum Algorithm { * @param value the value to hash * @return hash result */ - public abstract long hash(int value); + long hash(int value); /** * Compute hash for long value by using its plain encoding result. @@ -76,7 +76,7 @@ public enum Algorithm { * @param value the value to hash * @return hash result */ - public abstract long hash(long value) ; + long hash(long value) ; /** * Compute hash for double value by using its plain encoding result. @@ -84,7 +84,7 @@ public enum Algorithm { * @param value the value to hash * @return hash result */ - public abstract long hash(double value); + long hash(double value); /** * Compute hash for float value by using its plain encoding result. @@ -92,7 +92,7 @@ public enum Algorithm { * @param value the value to hash * @return hash result */ - public abstract long hash(float value); + long hash(float value); /** * Compute hash for Binary value by using its plain encoding result. @@ -100,12 +100,12 @@ public enum Algorithm { * @param value the value to hash * @return hash result */ - public abstract long hash(Binary value); + long hash(Binary value); /** * Get the number of bytes for bitset in this Bloom filter. * * @return The number of bytes for bitset in this Bloom filter. 
*/ - public abstract long getBitsetSize(); + long getBitsetSize(); } diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java b/parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java index a76109f127..8dbb0ba193 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java @@ -39,9 +39,9 @@ public class TestBlockSplitBloomFilter { @Test public void testConstructor () throws IOException { BloomFilter bloomFilter1 = new BlockSplitBloomFilter(0); - assertEquals(bloomFilter1.getBitsetSize(), BlockSplitBloomFilter.MINIMUM_BLOOM_FILTER_BYTES); - BloomFilter bloomFilter2 = new BlockSplitBloomFilter(256 * 1024 * 1024); - assertEquals(bloomFilter2.getBitsetSize(), BlockSplitBloomFilter.MAXIMUM_BLOOM_FILTER_BYTES); + assertEquals(bloomFilter1.getBitsetSize(), BlockSplitBloomFilter.MINIMUM_BYTES); + BloomFilter bloomFilter2 = new BlockSplitBloomFilter(BlockSplitBloomFilter.MAXIMUM_BYTES + 1); + assertEquals(bloomFilter2.getBitsetSize(), BlockSplitBloomFilter.MAXIMUM_BYTES); BloomFilter bloomFilter3 = new BlockSplitBloomFilter(1000); assertEquals(bloomFilter3.getBitsetSize(), 1024); } @@ -59,7 +59,7 @@ public void testBasic () throws IOException { BloomFilter bloomFilter = new BlockSplitBloomFilter(1024); for(int i = 0; i < testStrings.length; i++) { - bloomFilter.insert(bloomFilter.hash(Binary.fromString(testStrings[i]))); + bloomFilter.insertHash(bloomFilter.hash(Binary.fromString(testStrings[i]))); } File testFile = temp.newFile(); @@ -85,7 +85,7 @@ public void testBasic () throws IOException { fileInputStream.read(bitset); bloomFilter = new BlockSplitBloomFilter(bitset); for(int i = 0; i < testStrings.length; i++) { - assertTrue(bloomFilter.find(bloomFilter.hash(Binary.fromString(testStrings[i])))); 
+ assertTrue(bloomFilter.findHash(bloomFilter.hash(Binary.fromString(testStrings[i])))); } } @@ -101,14 +101,14 @@ public void testFPP() throws IOException { for(int i = 0; i < totalCount; i++) { String str = randomStr.get(10); strings.add(str); - bloomFilter.insert(bloomFilter.hash(Binary.fromString(str))); + bloomFilter.insertHash(bloomFilter.hash(Binary.fromString(str))); } // The exist counts the number of times FindHash returns true. int exist = 0; for (int i = 0; i < totalCount; i++) { String str = randomStr.get(8); - if (bloomFilter.find(bloomFilter.hash(Binary.fromString(str)))) { + if (bloomFilter.findHash(bloomFilter.hash(Binary.fromString(str)))) { exist ++; } } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java index 0789bf50d4..355c46b749 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -144,6 +144,9 @@ public static enum JobSummaryLevel { public static final String MAX_ROW_COUNT_FOR_PAGE_SIZE_CHECK = "parquet.page.size.row.check.max"; public static final String ESTIMATE_PAGE_SIZE_CHECK = "parquet.page.size.check.estimate"; public static final String COLUMN_INDEX_TRUNCATE_LENGTH = "parquet.columnindex.truncate.length"; + public static final String BLOOM_FILTER_COLUMN_NAMES = "parquet.bloom.filter.column.names"; + public static final String BLOOM_FILTER_EXPECTED_NDV = "parquet.bloom.filter.expected.ndv"; + public static final String ENABLE_BLOOM_FILTER = "parquet.enable.bloom.filter"; public static JobSummaryLevel getJobSummaryLevel(Configuration conf) { String level = conf.get(JOB_SUMMARY_LEVEL); @@ -209,6 +212,19 @@ public static boolean getEnableDictionary(JobContext jobContext) { return getEnableDictionary(getConfiguration(jobContext)); } + public static String getBloomFilterColumnNames(Configuration conf) { + return conf.get(BLOOM_FILTER_COLUMN_NAMES); + } + + public static String getBloomFilterExpectedNDV(Configuration configuration) { + return configuration.get(BLOOM_FILTER_EXPECTED_NDV); + } + + public static boolean getEnableBloomFilter(Configuration configuration) { + return configuration.getBoolean(ENABLE_BLOOM_FILTER, + ParquetProperties.DEFAULT_BLOOM_FILTER_ENABLED); + } + public static int getBlockSize(JobContext jobContext) { return getBlockSize(getConfiguration(jobContext)); } @@ -375,6 +391,8 @@ public RecordWriter getRecordWriter(Configuration conf, Path file, Comp .withPageSize(getPageSize(conf)) .withDictionaryPageSize(getDictionaryPageSize(conf)) .withDictionaryEncoding(getEnableDictionary(conf)) + .withBloomFilterEnabled(getEnableBloomFilter(conf)) 
+ .withBloomFilterInfo(getBloomFilterColumnNames(conf), getBloomFilterExpectedNDV(conf)) .withWriterVersion(getWriterVersion(conf)) .estimateRowCountForPageSizeCheck(getEstimatePageSizeCheck(conf)) .withMinRowCountForPageSizeCheck(getMinRowCountForPageSizeCheck(conf)) @@ -398,6 +416,10 @@ public RecordWriter getRecordWriter(Configuration conf, Path file, Comp LOG.info("Min row count for page size check is: {}", props.getMinRowCountForPageSizeCheck()); LOG.info("Max row count for page size check is: {}", props.getMaxRowCountForPageSizeCheck()); LOG.info("Truncate length for column indexes is: {}", props.getColumnIndexTruncateLength()); + LOG.info("Bloom Filter is {}", props.isBloomFilterEnabled()? "on": "off"); + LOG.info("Bloom filter enabled column names are: {}", props.getBloomFilterExpectedDistinctNumbers().keySet()); + LOG.info("Bloom filter enabled column expected number of distinct values are: {}", + props.getBloomFilterExpectedDistinctNumbers().values()); } WriteContext init = writeSupport.init(conf); diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java index e4a1d350cc..0cfb001d49 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java @@ -240,8 +240,8 @@ public void testBloomWriteRead() throws Exception { w.writeDataPage(2, 4, BytesInput.from(BYTES1),stats1, BIT_PACKED, BIT_PACKED, PLAIN); w.writeDataPage(3, 4, BytesInput.from(BYTES1),stats1, BIT_PACKED, BIT_PACKED, PLAIN); BloomFilter bloomData = new BlockSplitBloomFilter(0); - bloomData.insert(bloomData.hash(Binary.fromString("hello"))); - bloomData.insert(bloomData.hash(Binary.fromString("world"))); + bloomData.insertHash(bloomData.hash(Binary.fromString("hello"))); + bloomData.insertHash(bloomData.hash(Binary.fromString("world"))); long blStarts = 
w.getPos(); w.writeBloomFilter(bloomData); w.endColumn(); @@ -254,8 +254,8 @@ public void testBloomWriteRead() throws Exception { Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(schema.getColumnDescription(colPath))); BloomFilterReader bloomFilterReader = r.getBloomFilterDataReader(readFooter.getBlocks().get(0)); BloomFilter bloomDataRead = bloomFilterReader.readBloomFilter(col); - assertTrue(bloomDataRead.find(bloomData.hash(Binary.fromString("hello")))); - assertTrue(bloomDataRead.find(bloomData.hash(Binary.fromString("world")))); + assertTrue(bloomDataRead.findHash(bloomData.hash(Binary.fromString("hello")))); + assertTrue(bloomDataRead.findHash(bloomData.hash(Binary.fromString("world")))); } @Test From f03d875322716c2dca34187affc3d2b068df8055 Mon Sep 17 00:00:00 2001 From: Junjie Chen Date: Thu, 1 Nov 2018 00:02:56 +0800 Subject: [PATCH 6/9] address comments and fix enum issue --- .../parquet/column/ParquetProperties.java | 21 ++++++---------- .../column/impl/ColumnWriteStoreBase.java | 4 +-- .../parquet/column/impl/ColumnWriterBase.java | 6 ++--- .../bloomfilter/BlockSplitBloomFilter.java | 20 +++++++-------- .../values/bloomfilter/BloomFilter.java | 12 +++++++-- .../parquet/hadoop/ParquetOutputFormat.java | 25 +++++++++++++------ 6 files changed, 48 insertions(+), 40 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java b/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java index 1690b68e80..65cd4c0afc 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java @@ -211,7 +211,7 @@ public boolean isBloomFilterEnabled() { return enableBloomFilter; } - public HashMap getBloomFilterExpectedDistinctNumbers() { + public HashMap getBloomFilterColumnExpectedNDVs() { return bloomFilterExpectedDistinctNumbers; } @@ -235,7 +235,7 @@ public static class Builder { private 
ValuesWriterFactory valuesWriterFactory = DEFAULT_VALUES_WRITER_FACTORY; private int columnIndexTruncateLength = DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH; private boolean enableBloomFilter = DEFAULT_BLOOM_FILTER_ENABLED; - private HashMap bloomFilterExpectedDistinctNumbers = new HashMap<>(); + private HashMap bloomFilterColumnExpectedNDVs = new HashMap<>(); private Builder() { } @@ -249,7 +249,7 @@ private Builder(ParquetProperties toCopy) { this.estimateNextSizeCheck = toCopy.estimateNextSizeCheck; this.allocator = toCopy.allocator; this.enableBloomFilter = toCopy.enableBloomFilter; - this.bloomFilterExpectedDistinctNumbers = toCopy.bloomFilterExpectedDistinctNumbers; + this.bloomFilterColumnExpectedNDVs = toCopy.bloomFilterExpectedDistinctNumbers; } /** @@ -351,18 +351,11 @@ public Builder withBloomFilterEnabled(boolean enableBloomFilter) { /** * Set Bloom filter info for columns. * - * @param bloomFilterColumnNames the columns to be enabled for Bloom filter - * @param bloomFilterDistinctNumbers the expected distinct number of values corresponding to columns + * @param columnExpectedNDVs the columns expected number of distinct values in a row group * @return this builder for method chaining */ - public Builder withBloomFilterInfo(String bloomFilterColumnNames, String bloomFilterDistinctNumbers) { - String[] columnNames = bloomFilterColumnNames.split(","); - String[] expectedDistinctNumber = bloomFilterDistinctNumbers.split(","); - Preconditions.checkArgument(columnNames.length == expectedDistinctNumber.length, - "Column names are not matched to sizes"); - for (int i = 0; i < columnNames.length; i++) { - this.bloomFilterExpectedDistinctNumbers.put(columnNames[i], Long.getLong(expectedDistinctNumber[i])); - } + public Builder withBloomFilterInfo(HashMap columnExpectedNDVs) { + this.bloomFilterColumnExpectedNDVs = columnExpectedNDVs; return this; } @@ -371,7 +364,7 @@ public ParquetProperties build() { new ParquetProperties(writerVersion, pageSize, dictPageSize, 
enableDict, minRowCountForPageSizeCheck, maxRowCountForPageSizeCheck, estimateNextSizeCheck, allocator, valuesWriterFactory, columnIndexTruncateLength, - enableBloomFilter, bloomFilterExpectedDistinctNumbers); + enableBloomFilter, bloomFilterColumnExpectedNDVs); // we pass a constructed but uninitialized factory to ParquetProperties above as currently // creation of ValuesWriters is invoked from within ParquetProperties. In the future // we'd like to decouple that and won't need to pass an object to properties and then pass the diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreBase.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreBase.java index a0658640e8..744c24de78 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreBase.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreBase.java @@ -107,7 +107,7 @@ public ColumnWriter getColumnWriter(ColumnDescriptor path) { }; } - // The Bloom filter is written to a specified bitset instead of pages. So it needs a separated write store abstract. + // The Bloom filter is written to a specified bitset instead of pages, so it needs a separate write store abstract. 
ColumnWriteStoreBase( MessageType schema, PageWriteStore pageWriteStore, @@ -118,7 +118,7 @@ public ColumnWriter getColumnWriter(ColumnDescriptor path) { Map mcolumns = new TreeMap<>(); for (ColumnDescriptor path : schema.getColumns()) { PageWriter pageWriter = pageWriteStore.getPageWriter(path); - if (props.isBloomFilterEnabled() && props.getBloomFilterExpectedDistinctNumbers() != null) { + if (props.isBloomFilterEnabled() && props.getBloomFilterColumnExpectedNDVs() != null) { BloomFilterWriter bloomFilterWriter = bloomFilterWriteStore.getBloomFilterWriter(path); mcolumns.put(path, createColumnWriter(path, pageWriter, bloomFilterWriter, props)); } else { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java index 84a25e3757..c03b04fc5e 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java @@ -87,10 +87,10 @@ abstract class ColumnWriterBase implements ColumnWriter { } this.bloomFilterWriter = bloomFilterWriter; - HashMap bloomFilterExpectValues = props.getBloomFilterExpectedDistinctNumbers(); + HashMap bloomFilterColumnExpectedNDVs = props.getBloomFilterColumnExpectedNDVs(); String column = path.getPath()[0]; - if (bloomFilterExpectValues.keySet().contains(column)) { - int optimalNumOfBits = BlockSplitBloomFilter.optimalNumOfBits(bloomFilterExpectValues.get(column).intValue(), + if (bloomFilterColumnExpectedNDVs.keySet().contains(column)) { + int optimalNumOfBits = BlockSplitBloomFilter.optimalNumOfBits(bloomFilterColumnExpectedNDVs.get(column).intValue(), BlockSplitBloomFilter.DEFAULT_FPP); this.bloomFilter = new BlockSplitBloomFilter(optimalNumOfBits/8); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java 
b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java index 18d1876aaf..b6378976c3 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java @@ -86,18 +86,17 @@ public class BlockSplitBloomFilter implements BloomFilter { * of 2. It uses murmur3_x64_128 as its default hash function. */ public BlockSplitBloomFilter(int numBytes) { - this(numBytes, HashStrategy.MURMUR3_X64_128, Algorithm.BLOCK); + this(numBytes, HashStrategy.MURMUR3_X64_128); } /** - * Constructor of Bloom filter. It uses murmur3_x64_128 as its default hash - * function and block-based algorithm as its default algorithm. + * Constructor of block-based Bloom filter. It uses murmur3_x64_128 as its default hash + * function. * * @param numBytes The number of bytes for Bloom filter bitset * @param hashStrategy The hash strategy of Bloom filter. - * @param algorithm The algorithm of Bloom filter. */ - private BlockSplitBloomFilter(int numBytes, HashStrategy hashStrategy, Algorithm algorithm) { + private BlockSplitBloomFilter(int numBytes, HashStrategy hashStrategy) { initBitset(numBytes); switch (hashStrategy) { case MURMUR3_X64_128: @@ -112,12 +111,12 @@ private BlockSplitBloomFilter(int numBytes, HashStrategy hashStrategy, Algorithm /** * Construct the Bloom filter with given bitset, it is used when reconstructing * Bloom filter from parquet file. It use murmur3_x64_128 as its default hash - * function and block-based algorithm as default algorithm. + * function. * * @param bitset The given bitset to construct Bloom filter. 
*/ public BlockSplitBloomFilter(byte[] bitset) { - this(bitset, HashStrategy.MURMUR3_X64_128, Algorithm.BLOCK); + this(bitset, HashStrategy.MURMUR3_X64_128); } /** @@ -126,9 +125,8 @@ public BlockSplitBloomFilter(byte[] bitset) { * * @param bitset The given bitset to construct Bloom filter. * @param hashStrategy The hash strategy Bloom filter apply. - * @param algorithm The algorithm of Bloom filter. */ - private BlockSplitBloomFilter(byte[] bitset, HashStrategy hashStrategy, Algorithm algorithm) { + private BlockSplitBloomFilter(byte[] bitset, HashStrategy hashStrategy) { if (bitset == null) { throw new RuntimeException("Given bitset is null"); } @@ -174,9 +172,9 @@ public void writeTo(OutputStream out) throws IOException { // Write number of bytes of bitset. out.write(BytesUtils.intToBytes(bitset.length)); // Write hash strategy - out.write(BytesUtils.intToBytes(hashStrategy.ordinal())); + out.write(BytesUtils.intToBytes(hashStrategy.value)); // Write algorithm - out.write(BytesUtils.intToBytes(Algorithm.BLOCK.ordinal())); + out.write(BytesUtils.intToBytes(Algorithm.BLOCK.value)); // Write bitset out.write(bitset); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java index d02fa52398..3ec192e3e0 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java @@ -30,12 +30,20 @@ public interface BloomFilter { // Bloom filter Hash strategy. enum HashStrategy { - MURMUR3_X64_128 + MURMUR3_X64_128(0); + HashStrategy(int value) { + this.value = value; + } + int value; } // Bloom filter algorithm. 
enum Algorithm { - BLOCK + BLOCK(0); + Algorithm(int value) { + this.value = value; + } + int value; } /** diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java index 355c46b749..d716201d47 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java @@ -23,6 +23,7 @@ import static org.apache.parquet.hadoop.util.ContextUtil.getConfiguration; import java.io.IOException; +import java.util.HashMap; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; @@ -212,12 +213,20 @@ public static boolean getEnableDictionary(JobContext jobContext) { return getEnableDictionary(getConfiguration(jobContext)); } - public static String getBloomFilterColumnNames(Configuration conf) { - return conf.get(BLOOM_FILTER_COLUMN_NAMES); - } + public static HashMap getBloomFilterColumnExpectedNDVs(Configuration conf) { + HashMap kv = new HashMap<>(); + String[] columnNames = conf.get(BLOOM_FILTER_COLUMN_NAMES).split(","); + String[] expectedNDVs = conf.get(BLOOM_FILTER_EXPECTED_NDV).split(","); + + if (columnNames.length == expectedNDVs.length) { + for (int i = 0; i < columnNames.length; i++) { + kv.put(columnNames[i], Long.parseLong(expectedNDVs[i].trim())); // parse the configured number itself; Long.getLong would look up a JVM system property of that name and yield null + } + } else { + LOG.warn("Bloom filter column names are not match expected NDVs"); + } - public static String getBloomFilterExpectedNDV(Configuration configuration) { - return configuration.get(BLOOM_FILTER_EXPECTED_NDV); + return kv; } public static boolean getEnableBloomFilter(Configuration configuration) { @@ -392,7 +401,7 @@ public RecordWriter getRecordWriter(Configuration conf, Path file, Comp .withDictionaryPageSize(getDictionaryPageSize(conf)) .withDictionaryEncoding(getEnableDictionary(conf)) .withBloomFilterEnabled(getEnableBloomFilter(conf)) -
.withBloomFilterInfo(getBloomFilterColumnNames(conf), getBloomFilterExpectedNDV(conf)) + .withBloomFilterInfo(getBloomFilterColumnExpectedNDVs(conf)) .withWriterVersion(getWriterVersion(conf)) .estimateRowCountForPageSizeCheck(getEstimatePageSizeCheck(conf)) .withMinRowCountForPageSizeCheck(getMinRowCountForPageSizeCheck(conf)) @@ -417,9 +426,9 @@ public RecordWriter getRecordWriter(Configuration conf, Path file, Comp LOG.info("Max row count for page size check is: {}", props.getMaxRowCountForPageSizeCheck()); LOG.info("Truncate length for column indexes is: {}", props.getColumnIndexTruncateLength()); LOG.info("Bloom Filter is {}", props.isBloomFilterEnabled()? "on": "off"); - LOG.info("Bloom filter enabled column names are: {}", props.getBloomFilterExpectedDistinctNumbers().keySet()); + LOG.info("Bloom filter enabled column names are: {}", props.getBloomFilterColumnExpectedNDVs().keySet()); LOG.info("Bloom filter enabled column expected number of distinct values are: {}", - props.getBloomFilterExpectedDistinctNumbers().values()); + props.getBloomFilterColumnExpectedNDVs().values()); } WriteContext init = writeSupport.init(conf); From 5e4647fddfacb50e278757f13f5496f60e24549b Mon Sep 17 00:00:00 2001 From: "Chen, Junjie" Date: Mon, 24 Dec 2018 16:16:06 +0800 Subject: [PATCH 7/9] Fix build issue caused by merge --- .../org/apache/parquet/column/ParquetProperties.java | 9 +++------ .../apache/parquet/hadoop/ParquetOutputFormat.java | 11 +++++++++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java b/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java index 078d74eec2..4df5b71260 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java @@ -98,10 +98,8 @@ public static WriterVersion fromString(String name) { private 
ParquetProperties(WriterVersion writerVersion, int pageSize, int dictPageSize, boolean enableDict, int minRowCountForPageSizeCheck, int maxRowCountForPageSizeCheck, boolean estimateNextSizeCheck, ByteBufferAllocator allocator, - ValuesWriterFactory writerFactory, int columnIndexMinMaxTruncateLength, boolean enableBloomFilter, - HashMap bloomFilterExpectedDistinctNumber) { - - ValuesWriterFactory writerFactory, int columnIndexMinMaxTruncateLength, int pageRowCountLimit) { + ValuesWriterFactory writerFactory, int columnIndexMinMaxTruncateLength, int pageRowCountLimit, + boolean enableBloomFilter, HashMap bloomFilterExpectedDistinctNumber) { this.pageSizeThreshold = pageSize; this.initialSlabSize = CapacityByteArrayOutputStream .initialSlabSizeHeuristic(MIN_SLAB_SIZE, pageSizeThreshold, 10); @@ -381,8 +379,7 @@ public ParquetProperties build() { ParquetProperties properties = new ParquetProperties(writerVersion, pageSize, dictPageSize, enableDict, minRowCountForPageSizeCheck, maxRowCountForPageSizeCheck, - estimateNextSizeCheck, allocator, valuesWriterFactory, columnIndexTruncateLength, pageRowCountLimit); - estimateNextSizeCheck, allocator, valuesWriterFactory, columnIndexTruncateLength, + estimateNextSizeCheck, allocator, valuesWriterFactory, columnIndexTruncateLength, pageRowCountLimit, enableBloomFilter, bloomFilterColumnExpectedNDVs); // we pass a constructed but uninitialized factory to ParquetProperties above as currently // creation of ValuesWriters is invoked from within ParquetProperties. 
In the future diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java index a9aa97856b..33c3715378 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java @@ -216,8 +216,15 @@ public static boolean getEnableDictionary(JobContext jobContext) { public static HashMap getBloomFilterColumnExpectedNDVs(Configuration conf) { HashMap kv = new HashMap<>(); - String[] columnNames = conf.get(BLOOM_FILTER_COLUMN_NAMES).split(","); - String[] expectedNDVs = conf.get(BLOOM_FILTER_EXPECTED_NDV).split(","); + String columnNamesConf = conf.get(BLOOM_FILTER_COLUMN_NAMES); + String expectedNDVsConf = conf.get(BLOOM_FILTER_EXPECTED_NDV); + + if (columnNamesConf == null || expectedNDVsConf == null) { + return kv; + } + + String[] columnNames = columnNamesConf.split(","); + String[] expectedNDVs = expectedNDVsConf.split(","); if (columnNames.length == expectedNDVs.length) { for (int i = 0; i < columnNames.length; i++) { From 894040d47a08aedb042f5c2fd125116826856313 Mon Sep 17 00:00:00 2001 From: Junjie Chen Date: Tue, 8 Jan 2019 07:26:12 +0800 Subject: [PATCH 8/9] test build --- .travis.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.travis.yml b/.travis.yml index e4e623f03b..17d7ee7dbb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -29,6 +29,10 @@ before_install: - sudo make install - cd .. - date + - git clone https://github.com/apache/parquet-format.git + - cd parquet-format + - mvn install -DskipTests + - cd .. 
env: - HADOOP_PROFILE=default TEST_CODECS=uncompressed,brotli From fb0ab5c4d2212cd036906ca1a451ede37d6eb36a Mon Sep 17 00:00:00 2001 From: Junjie Chen Date: Thu, 10 Jan 2019 23:31:57 +0800 Subject: [PATCH 9/9] update check for Bloom filter reader --- .../apache/parquet/hadoop/ParquetFileReader.java | 14 +++++++++++++- .../hadoop/metadata/ColumnChunkMetaData.java | 4 ++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java index 6f03dd555e..7fe0e410ee 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java @@ -1062,16 +1062,28 @@ public BloomFilterDataReader getBloomFilterDataReader(BlockMetaData block) { */ public BloomFilter readBloomFilter(ColumnChunkMetaData meta) throws IOException { long bloomFilterOffset = meta.getBloomFilterOffset(); - if (bloomFilterOffset == Long.MAX_VALUE) return null; f.seek(bloomFilterOffset); + // Read Bloom filter data header. 
byte[] bytes = new byte[BlockSplitBloomFilter.HEADER_SIZE]; - f.read(bytes); + f.readFully(bytes); ByteBuffer bloomHeader = ByteBuffer.wrap(bytes); IntBuffer headerBuffer = bloomHeader.order(ByteOrder.LITTLE_ENDIAN).asIntBuffer(); int numBytes = headerBuffer.get(); + if (numBytes <= 0 || numBytes > BlockSplitBloomFilter.MAXIMUM_BYTES) { + return null; + } + BloomFilter.HashStrategy hash = BloomFilter.HashStrategy.values()[headerBuffer.get()]; + if (hash != BlockSplitBloomFilter.HashStrategy.MURMUR3_X64_128) { + return null; + } + BloomFilter.Algorithm algorithm = BloomFilter.Algorithm.values()[headerBuffer.get()]; + if (algorithm != BlockSplitBloomFilter.Algorithm.BLOCK) { + return null; + } + byte[] bitset = new byte[numBytes]; f.readFully(bitset); return new BlockSplitBloomFilter(bitset); diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java index c55225c176..3156132534 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java @@ -126,7 +126,7 @@ && positiveLongFitsInAnInt(totalUncompressedSize)) { statistics, firstDataPage, dictionaryPageOffset, - Long.MAX_VALUE, + 0, valueCount, totalSize, totalUncompressedSize); @@ -137,7 +137,7 @@ && positiveLongFitsInAnInt(totalUncompressedSize)) { statistics, firstDataPage, dictionaryPageOffset, - Long.MAX_VALUE, + 0, valueCount, totalSize, totalUncompressedSize);