Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions src/iceberg/test/string_util_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,32 @@ TEST(StringUtilsTest, ToUpper) {
ASSERT_EQ(StringUtils::ToUpper("123"), "123");
}

// Non-ASCII (multibyte UTF-8) bytes have the high bit set, i.e. are negative when stored
// in a signed char. Only ASCII letters are converted; multibyte bytes pass through
// unchanged. The non-ASCII strings are written as explicit UTF-8 byte escapes so the test
// does not depend on the source-file encoding. See
// https://github.com/apache/iceberg-cpp/issues/613.
TEST(StringUtilsTest, NonAsciiPassThrough) {
  // Every fixture is spelled as raw UTF-8 byte escapes, so the expectations hold
  // regardless of how this source file is encoded.

  // U+00EF (i with diaeresis) is 0xC3 0xAF; lowering "Na<U+00EF>ve" must only touch
  // the ASCII 'N'.
  constexpr char kNaiveMixedCase[] = "Na\xC3\xAFve";
  constexpr char kNaiveLowerCase[] = "na\xC3\xAFve";
  ASSERT_EQ(StringUtils::ToLower(kNaiveMixedCase), kNaiveLowerCase);

  // U+00E9 (e with acute) is 0xC3 0xA9; uppercasing "caf<U+00E9>" must leave those
  // two bytes alone.
  constexpr char kCafeLowerCase[] = "caf\xC3\xA9";
  constexpr char kCafeAsciiUpper[] = "CAF\xC3\xA9";
  ASSERT_EQ(StringUtils::ToUpper(kCafeLowerCase), kCafeAsciiUpper);

  // A string made only of three-byte sequences (U+65E5 U+672C U+8A9E) has no ASCII
  // letters, so both conversions must hand it back verbatim.
  constexpr char kNoAsciiLetters[] = "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E";
  ASSERT_EQ(StringUtils::ToLower(kNoAsciiLetters), kNoAsciiLetters);
  ASSERT_EQ(StringUtils::ToUpper(kNoAsciiLetters), kNoAsciiLetters);
}

TEST(StringUtilsTest, EqualsIgnoreCase) {
  // Local shorthand for the function under test; keeps each expectation on one line.
  const auto same_ignoring_case = [](std::string_view lhs, std::string_view rhs) {
    return StringUtils::EqualsIgnoreCase(lhs, rhs);
  };

  // Pure-ASCII inputs: case differences are ignored, while length and content
  // differences are not.
  ASSERT_TRUE(same_ignoring_case("AbC", "abc"));
  ASSERT_TRUE(same_ignoring_case("", ""));
  ASSERT_FALSE(same_ignoring_case("abc", "abcd"));
  ASSERT_FALSE(same_ignoring_case("abc", "abd"));

  // Inputs ending in U+00E9 (0xC3 0xA9): the ASCII prefix is case-folded and the
  // multibyte tail is compared byte-for-byte.
  ASSERT_TRUE(same_ignoring_case("Caf\xC3\xA9", "caf\xC3\xA9"));
  // The two-byte U+00E9 is not treated as equal to a plain ASCII 'e'.
  ASSERT_FALSE(same_ignoring_case("caf\xC3\xA9", "cafe"));
}

} // namespace iceberg
23 changes: 20 additions & 3 deletions src/iceberg/util/string_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#pragma once

#include <algorithm>
#include <cctype>
#include <cerrno>
#include <charconv>
#include <ranges>
Expand All @@ -40,19 +41,22 @@ concept FromChars = requires(const char* p, T& v) { std::from_chars(p, p, v); };

class ICEBERG_EXPORT StringUtils {
public:
// NOTE: These convert ASCII letters only; all other bytes, including non-ASCII
// (multibyte UTF-8) bytes, are passed through unchanged.
// See https://github.com/apache/iceberg-cpp/issues/613.
/// \brief Return a copy of `str` with the ASCII letters 'A'-'Z' lowercased.
///
/// Each byte is mapped through ToLowerAscii, so bytes outside 'A'-'Z' -- including
/// the high-bit bytes of multibyte UTF-8 sequences -- are copied unchanged. The
/// former std::tolower-based pipeline is intentionally gone: it depended on the
/// current C locale and was undefined for negative `char` values.
///
/// \param str Input text; it is only read, never modified.
/// \return A new string of the same length as `str`.
static std::string ToLower(std::string_view str) {
  return str | std::ranges::views::transform(ToLowerAscii) |
         std::ranges::to<std::string>();
}

/// \brief Return a copy of `str` with the ASCII letters 'a'-'z' uppercased.
///
/// Each byte is mapped through ToUpperAscii, so bytes outside 'a'-'z' -- including
/// the high-bit bytes of multibyte UTF-8 sequences -- are copied unchanged. The
/// former std::toupper-based pipeline is intentionally gone: it depended on the
/// current C locale and was undefined for negative `char` values.
///
/// \param str Input text; it is only read, never modified.
/// \return A new string of the same length as `str`.
static std::string ToUpper(std::string_view str) {
  return str | std::ranges::views::transform(ToUpperAscii) |
         std::ranges::to<std::string>();
}

/// \brief Compare two strings for equality, ignoring the case of ASCII letters.
///
/// Both sides are folded byte-by-byte with ToLowerAscii before comparison, so only
/// 'A'-'Z' versus 'a'-'z' differences are ignored; every other byte (including the
/// bytes of multibyte UTF-8 sequences) must match exactly. Using ToLowerAscii rather
/// than std::tolower keeps the result independent of the C locale and well defined
/// for negative `char` values.
///
/// \param lhs First string.
/// \param rhs Second string.
/// \return true when the two strings have the same length and every pair of
///         corresponding bytes is equal after ASCII lowercasing.
static bool EqualsIgnoreCase(std::string_view lhs, std::string_view rhs) {
  return std::ranges::equal(
      lhs, rhs, [](char lc, char rc) { return ToLowerAscii(lc) == ToLowerAscii(rc); });
}

static bool StartsWithIgnoreCase(std::string_view str, std::string_view prefix) {
Expand Down Expand Up @@ -128,6 +132,19 @@ class ICEBERG_EXPORT StringUtils {
}
return value;
}

private:
// ASCII-only case conversion using explicit range checks rather than
// std::tolower/std::toupper. This is independent of the current C locale and never
// touches non-ASCII (high-bit) bytes, so multibyte UTF-8 sequences are preserved. It
// also sidesteps the undefined behavior of passing a negative char to <cctype>.
// Maps 'A'..'Z' onto 'a'..'z'; any other byte value is returned untouched.
static constexpr char ToLowerAscii(char c) noexcept {
  // Guard clause: anything outside the uppercase ASCII range passes straight through.
  if (c < 'A' || c > 'Z') {
    return c;
  }
  // Shift by the fixed distance between the uppercase and lowercase ASCII letters.
  return static_cast<char>(c + ('a' - 'A'));
}

// Maps 'a'..'z' onto 'A'..'Z'; any other byte value is returned untouched.
static constexpr char ToUpperAscii(char c) noexcept {
  const bool is_ascii_lower = ('a' <= c) && (c <= 'z');
  if (!is_ascii_lower) {
    return c;
  }
  // Shift by the fixed distance between the lowercase and uppercase ASCII letters.
  return static_cast<char>(c - ('a' - 'A'));
}
};

/// \brief Transparent hash function that supports std::string_view as lookup key
Expand Down
Loading