diff --git a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java index e37ee12483..173581bddf 100644 --- a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java +++ b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java @@ -27,7 +27,6 @@ import java.nio.ByteOrder; import java.nio.CharBuffer; import java.nio.charset.CharacterCodingException; -import java.nio.charset.CharsetEncoder; import java.nio.charset.StandardCharsets; import java.util.Arrays; import org.apache.parquet.io.ParquetEncodingException; @@ -268,14 +267,16 @@ public String toString() { return "Binary{\"" + toStringUsingUTF8() + "\"}"; } - private static final ThreadLocal<CharsetEncoder> ENCODER = - ThreadLocal.withInitial(StandardCharsets.UTF_8::newEncoder); - private static ByteBuffer encodeUTF8(CharSequence value) { try { - return ENCODER.get().encode(CharBuffer.wrap(value)); + // Use a fresh encoder per call rather than a static ThreadLocal initialized with a lambda + // (UTF_8::newEncoder): that lambda's class is loaded by the application ClassLoader and can + // keep it from being unloaded in long-lived pooled threads, leaking Metaspace (GH-3398). + // The encoder also preserves strict CodingErrorAction.REPORT, so malformed UTF-16 fails + // fast instead of being silently replaced (as String#getBytes(UTF_8) would). 
+ return StandardCharsets.UTF_8.newEncoder().encode(CharBuffer.wrap(value)); } catch (CharacterCodingException e) { - throw new ParquetEncodingException("UTF-8 not supported.", e); + throw new ParquetEncodingException("Failed to encode CharSequence as UTF-8.", e); } } } diff --git a/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java b/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java index a1a83af771..3dcb878d2c 100644 --- a/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java +++ b/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java @@ -30,7 +30,10 @@ import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.nio.ByteBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.StandardCharsets; import java.util.Arrays; +import org.apache.parquet.io.ParquetEncodingException; import org.apache.parquet.io.api.TestBinary.BinaryFactory.BinaryAndOriginal; import org.junit.Test; @@ -314,4 +317,36 @@ public void testGet2BytesLittleEndianWrongLength() { // expected } } + + @Test + public void testFromCharSequenceEncodesValidUtf8() { + // Cover ASCII, multi-byte BMP, a supplementary code point (valid surrogate pair) and empty. + assertFromCharSequenceEncodesUtf8("test-123-é中"); // ASCII + U+00E9 (2-byte) + U+4E2D (3-byte) + assertFromCharSequenceEncodesUtf8("😀"); // U+1F600, valid surrogate pair (4-byte) + assertFromCharSequenceEncodesUtf8(""); // empty + } + + private static void assertFromCharSequenceEncodesUtf8(String value) { + // fromCharSequence routes any CharSequence (here a StringBuilder) through FromCharSequenceBinary. + // For valid input the strict encoder must match String#getBytes(UTF_8), so this is a genuine + // cross-check, not a circular assertion. 
+ Binary binary = Binary.fromCharSequence(new StringBuilder(value)); + assertArrayEquals(value.getBytes(StandardCharsets.UTF_8), binary.getBytes()); + } + + @Test + public void testFromCharSequenceRejectsMalformedUtf16() { + // An unpaired high surrogate is invalid UTF-16. FromCharSequenceBinary must fail fast + // rather than silently substituting a replacement byte (as String#getBytes(UTF_8) would). + CharSequence value = new StringBuilder().append('a').append('\uD800').append('b'); + try { + Binary.fromCharSequence(value); + fail("Should have thrown an exception for malformed UTF-16 input"); + } catch (ParquetEncodingException e) { + // Lock in that the cause is a UTF-8 coding error, not an unrelated failure of the same type. + assertTrue( + "expected a CharacterCodingException cause but was " + e.getCause(), + e.getCause() instanceof CharacterCodingException); + } + } }