From 5f4ff98fb902660af63788234ebe1e22314cd69f Mon Sep 17 00:00:00 2001 From: Rahul Goel Date: Mon, 15 Jun 2026 20:35:45 -0400 Subject: [PATCH 1/3] fix: cast to unsigned char in ASCII case conversion to avoid undefined behavior --- src/iceberg/test/string_util_test.cc | 22 ++++++++++++++++++++++ src/iceberg/util/string_util.h | 22 +++++++++++++++++++--- 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/src/iceberg/test/string_util_test.cc b/src/iceberg/test/string_util_test.cc index 402588114..85fd35b43 100644 --- a/src/iceberg/test/string_util_test.cc +++ b/src/iceberg/test/string_util_test.cc @@ -41,4 +41,26 @@ TEST(StringUtilsTest, ToUpper) { ASSERT_EQ(StringUtils::ToUpper("123"), "123"); } +// Non-ASCII (multibyte UTF-8) bytes have the high bit set, i.e. are negative +// when stored in a signed char. Passing them straight to std::tolower/toupper is +// undefined behavior. Conversion only touches ASCII; multibyte bytes pass through +// unchanged. See https://github.com/apache/iceberg-cpp/issues/613. +TEST(StringUtilsTest, NonAsciiPassThrough) { + ASSERT_EQ(StringUtils::ToLower("Naïve"), "naïve"); + ASSERT_EQ(StringUtils::ToUpper("café"), "CAFé"); + // Pure non-ASCII input is returned verbatim. + ASSERT_EQ(StringUtils::ToLower("日本語"), "日本語"); + ASSERT_EQ(StringUtils::ToUpper("日本語"), "日本語"); +} + +TEST(StringUtilsTest, EqualsIgnoreCase) { + ASSERT_TRUE(StringUtils::EqualsIgnoreCase("AbC", "abc")); + ASSERT_TRUE(StringUtils::EqualsIgnoreCase("", "")); + ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "abcd")); + ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "abd")); + // ASCII case is folded; non-ASCII bytes are compared as-is (no UB). + ASSERT_TRUE(StringUtils::EqualsIgnoreCase("Café", "café")); + ASSERT_FALSE(StringUtils::EqualsIgnoreCase("café", "cafe")); +} + } // namespace iceberg diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h index 36dfba30f..6547ceada 100644 --- a/src/iceberg/util/string_util.h +++ b/src/iceberg/util/string_util.h @@ -20,6 +20,7 @@ #pragma once #include +#include #include #include #include @@ -40,19 +41,22 @@ concept FromChars = requires(const char* p, T& v) { std::from_chars(p, p, v); }; class ICEBERG_EXPORT StringUtils { public: + // NOTE: These only convert ASCII letters (under the default C locale); non-ASCII + // (multibyte UTF-8) bytes are left unchanged. + // See https://github.com/apache/iceberg-cpp/issues/613. static std::string ToLower(std::string_view str) { - return str | std::ranges::views::transform([](char c) { return std::tolower(c); }) | + return str | std::ranges::views::transform(ToLowerAscii) | std::ranges::to(); } static std::string ToUpper(std::string_view str) { - return str | std::ranges::views::transform([](char c) { return std::toupper(c); }) | + return str | std::ranges::views::transform(ToUpperAscii) | std::ranges::to(); } static bool EqualsIgnoreCase(std::string_view lhs, std::string_view rhs) { return std::ranges::equal( - lhs, rhs, [](char lc, char rc) { return std::tolower(lc) == std::tolower(rc); }); + lhs, rhs, [](char lc, char rc) { return ToLowerAscii(lc) == ToLowerAscii(rc); }); } static bool StartsWithIgnoreCase(std::string_view str, std::string_view prefix) { @@ -128,6 +132,18 @@ class ICEBERG_EXPORT StringUtils { } return value; } + + private: + // std::tolower/std::toupper require their argument to be representable as + // unsigned char (or EOF); passing a raw char with a non-ASCII (negative) value is + // undefined behavior, so cast through unsigned char first. + static char ToLowerAscii(char c) { + return static_cast(std::tolower(static_cast(c))); + } + + static char ToUpperAscii(char c) { + return static_cast(std::toupper(static_cast(c))); + } }; /// \brief Transparent hash function that supports std::string_view as lookup key From dd0e57a148b9a5e4b8ddbf493dbf1b0807682267 Mon Sep 17 00:00:00 2001 From: Rahul Goel Date: Mon, 15 Jun 2026 21:28:47 -0400 Subject: [PATCH 2/3] chore: empty commit to trigger CI Co-Authored-By: Claude Opus 4.8 From d8ef19048694f6c8c2783dc70c82c088eb076b77 Mon Sep 17 00:00:00 2001 From: Rahul Goel Date: Tue, 16 Jun 2026 20:17:27 -0400 Subject: [PATCH 3/3] fix locale dependency --- src/iceberg/test/string_util_test.cc | 30 +++++++++++++++++----------- src/iceberg/util/string_util.h | 19 +++++++++--------- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/src/iceberg/test/string_util_test.cc b/src/iceberg/test/string_util_test.cc index 85fd35b43..a3fd03760 100644 --- a/src/iceberg/test/string_util_test.cc +++ b/src/iceberg/test/string_util_test.cc @@ -41,16 +41,21 @@ TEST(StringUtilsTest, ToUpper) { ASSERT_EQ(StringUtils::ToUpper("123"), "123"); } -// Non-ASCII (multibyte UTF-8) bytes have the high bit set, i.e. are negative -// when stored in a signed char. Passing them straight to std::tolower/toupper is -// undefined behavior. Conversion only touches ASCII; multibyte bytes pass through -// unchanged. See https://github.com/apache/iceberg-cpp/issues/613. +// Non-ASCII (multibyte UTF-8) bytes have the high bit set, i.e. are negative when stored +// in a signed char. Only ASCII letters are converted; multibyte bytes pass through +// unchanged. The non-ASCII strings are written as explicit UTF-8 byte escapes so the test +// does not depend on the source-file encoding. See +// https://github.com/apache/iceberg-cpp/issues/613. TEST(StringUtilsTest, NonAsciiPassThrough) { - ASSERT_EQ(StringUtils::ToLower("Naïve"), "naïve"); - ASSERT_EQ(StringUtils::ToUpper("café"), "CAFé"); - // Pure non-ASCII input is returned verbatim. - ASSERT_EQ(StringUtils::ToLower("日本語"), "日本語"); - ASSERT_EQ(StringUtils::ToUpper("日本語"), "日本語"); + // "Naïve" -> "naïve" (ï = U+00EF = 0xC3 0xAF; only the ASCII letters change). + ASSERT_EQ(StringUtils::ToLower("Na\xC3\xAFve"), "na\xC3\xAFve"); + // "café" -> "CAFé" (é = U+00E9 = 0xC3 0xA9 stays unchanged). + ASSERT_EQ(StringUtils::ToUpper("caf\xC3\xA9"), "CAF\xC3\xA9"); + // "日本語" (0xE6 0x97 0xA5 0xE6 0x9C 0xAC 0xE8 0xAA 0x9E) is returned verbatim. + ASSERT_EQ(StringUtils::ToLower("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"), + "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"); + ASSERT_EQ(StringUtils::ToUpper("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"), + "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"); } TEST(StringUtilsTest, EqualsIgnoreCase) { @@ -58,9 +63,10 @@ TEST(StringUtilsTest, EqualsIgnoreCase) { ASSERT_TRUE(StringUtils::EqualsIgnoreCase("", "")); ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "abcd")); ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "abd")); - // ASCII case is folded; non-ASCII bytes are compared as-is (no UB). - ASSERT_TRUE(StringUtils::EqualsIgnoreCase("Café", "café")); - ASSERT_FALSE(StringUtils::EqualsIgnoreCase("café", "cafe")); + // ASCII case is folded; non-ASCII bytes are compared as-is. ("Café" vs "café") + ASSERT_TRUE(StringUtils::EqualsIgnoreCase("Caf\xC3\xA9", "caf\xC3\xA9")); + // "café" vs "cafe": the multibyte é differs from ASCII 'e'. + ASSERT_FALSE(StringUtils::EqualsIgnoreCase("caf\xC3\xA9", "cafe")); } } // namespace iceberg diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h index 6547ceada..01b6087b8 100644 --- a/src/iceberg/util/string_util.h +++ b/src/iceberg/util/string_util.h @@ -41,8 +41,8 @@ concept FromChars = requires(const char* p, T& v) { std::from_chars(p, p, v); }; class ICEBERG_EXPORT StringUtils { public: - // NOTE: These only convert ASCII letters (under the default C locale); non-ASCII - // (multibyte UTF-8) bytes are left unchanged. + // NOTE: These convert ASCII letters only; all other bytes, including non-ASCII + // (multibyte UTF-8) bytes, are passed through unchanged. // See https://github.com/apache/iceberg-cpp/issues/613. static std::string ToLower(std::string_view str) { return str | std::ranges::views::transform(ToLowerAscii) | @@ -134,15 +134,16 @@ class ICEBERG_EXPORT StringUtils { } private: - // std::tolower/std::toupper require their argument to be representable as - // unsigned char (or EOF); passing a raw char with a non-ASCII (negative) value is - // undefined behavior, so cast through unsigned char first. - static char ToLowerAscii(char c) { - return static_cast(std::tolower(static_cast(c))); + // ASCII-only case conversion using explicit range checks rather than + // std::tolower/std::toupper. This is independent of the current C locale and never + // touches non-ASCII (high-bit) bytes, so multibyte UTF-8 sequences are preserved. It + // also sidesteps the undefined behavior of passing a negative char to . + static constexpr char ToLowerAscii(char c) noexcept { + return (c >= 'A' && c <= 'Z') ? static_cast(c - 'A' + 'a') : c; } - static char ToUpperAscii(char c) { - return static_cast(std::toupper(static_cast(c))); + static constexpr char ToUpperAscii(char c) noexcept { + return (c >= 'a' && c <= 'z') ? static_cast(c - 'a' + 'A') : c; } };