diff --git a/src/iceberg/test/string_util_test.cc b/src/iceberg/test/string_util_test.cc index 402588114..a3fd03760 100644 --- a/src/iceberg/test/string_util_test.cc +++ b/src/iceberg/test/string_util_test.cc @@ -41,4 +41,32 @@ TEST(StringUtilsTest, ToUpper) { ASSERT_EQ(StringUtils::ToUpper("123"), "123"); } +// Non-ASCII (multibyte UTF-8) bytes have the high bit set, i.e. are negative when stored +// in a signed char. Only ASCII letters are converted; multibyte bytes pass through +// unchanged. The non-ASCII strings are written as explicit UTF-8 byte escapes so the test +// does not depend on the source-file encoding. See +// https://github.com/apache/iceberg-cpp/issues/613. +TEST(StringUtilsTest, NonAsciiPassThrough) { + // "Naïve" -> "naïve" (ï = U+00EF = 0xC3 0xAF; only the ASCII letters change). + ASSERT_EQ(StringUtils::ToLower("Na\xC3\xAFve"), "na\xC3\xAFve"); + // "café" -> "CAFé" (é = U+00E9 = 0xC3 0xA9 stays unchanged). + ASSERT_EQ(StringUtils::ToUpper("caf\xC3\xA9"), "CAF\xC3\xA9"); + // "日本語" (0xE6 0x97 0xA5 0xE6 0x9C 0xAC 0xE8 0xAA 0x9E) is returned verbatim. + ASSERT_EQ(StringUtils::ToLower("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"), + "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"); + ASSERT_EQ(StringUtils::ToUpper("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"), + "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"); +} + +TEST(StringUtilsTest, EqualsIgnoreCase) { + ASSERT_TRUE(StringUtils::EqualsIgnoreCase("AbC", "abc")); + ASSERT_TRUE(StringUtils::EqualsIgnoreCase("", "")); + ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "abcd")); + ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "abd")); + // ASCII case is folded; non-ASCII bytes are compared as-is. ("Café" vs "café") + ASSERT_TRUE(StringUtils::EqualsIgnoreCase("Caf\xC3\xA9", "caf\xC3\xA9")); + // "café" vs "cafe": the multibyte é differs from ASCII 'e'. + ASSERT_FALSE(StringUtils::EqualsIgnoreCase("caf\xC3\xA9", "cafe")); +} + } // namespace iceberg diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h index 36dfba30f..01b6087b8 100644 --- a/src/iceberg/util/string_util.h +++ b/src/iceberg/util/string_util.h @@ -20,6 +20,7 @@ #pragma once #include +#include #include #include #include @@ -40,19 +41,22 @@ concept FromChars = requires(const char* p, T& v) { std::from_chars(p, p, v); }; class ICEBERG_EXPORT StringUtils { public: + // NOTE: These convert ASCII letters only; all other bytes, including non-ASCII + // (multibyte UTF-8) bytes, are passed through unchanged. + // See https://github.com/apache/iceberg-cpp/issues/613. static std::string ToLower(std::string_view str) { - return str | std::ranges::views::transform([](char c) { return std::tolower(c); }) | + return str | std::ranges::views::transform(ToLowerAscii) | std::ranges::to(); } static std::string ToUpper(std::string_view str) { - return str | std::ranges::views::transform([](char c) { return std::toupper(c); }) | + return str | std::ranges::views::transform(ToUpperAscii) | std::ranges::to(); } static bool EqualsIgnoreCase(std::string_view lhs, std::string_view rhs) { return std::ranges::equal( - lhs, rhs, [](char lc, char rc) { return std::tolower(lc) == std::tolower(rc); }); + lhs, rhs, [](char lc, char rc) { return ToLowerAscii(lc) == ToLowerAscii(rc); }); } static bool StartsWithIgnoreCase(std::string_view str, std::string_view prefix) { @@ -128,6 +132,19 @@ class ICEBERG_EXPORT StringUtils { } return value; } + + private: + // ASCII-only case conversion using explicit range checks rather than + // std::tolower/std::toupper. This is independent of the current C locale and never + // touches non-ASCII (high-bit) bytes, so multibyte UTF-8 sequences are preserved. It + // also sidesteps the undefined behavior of passing a negative char to . + static constexpr char ToLowerAscii(char c) noexcept { + return (c >= 'A' && c <= 'Z') ? static_cast(c - 'A' + 'a') : c; + } + + static constexpr char ToUpperAscii(char c) noexcept { + return (c >= 'a' && c <= 'z') ? static_cast(c - 'a' + 'A') : c; + } }; /// \brief Transparent hash function that supports std::string_view as lookup key