From f42e2dac535f2ef86788288518f8ffef08bdc429 Mon Sep 17 00:00:00 2001 From: Rahul Goel Date: Thu, 18 Jun 2026 21:35:54 -0400 Subject: [PATCH 1/2] feat(string_util): Unicode-aware ToLower via utf8proc Replace the ASCII-only ToLower with utf8proc simple case mapping so case-insensitive name handling matches Iceberg Java's toLowerCase(Locale.ROOT). ToUpper stays ASCII-only since it is not used for name matching. EqualsIgnoreCase now compares lowercased forms. Wire utf8proc into both the CMake (vendored/system) and Meson builds. See https://github.com/apache/iceberg-cpp/issues/613. --- .../IcebergThirdpartyToolchain.cmake | 57 +++++++++++++++++++ src/iceberg/CMakeLists.txt | 8 ++- src/iceberg/meson.build | 9 ++- src/iceberg/test/string_util_test.cc | 45 ++++++++++----- src/iceberg/util/string_util.cc | 31 ++++++++++ src/iceberg/util/string_util.h | 33 +++++------ subprojects/utf8proc.wrap | 30 ++++++++++ 7 files changed, 177 insertions(+), 36 deletions(-) create mode 100644 subprojects/utf8proc.wrap diff --git a/cmake_modules/IcebergThirdpartyToolchain.cmake b/cmake_modules/IcebergThirdpartyToolchain.cmake index 8e10fd8ec..982390332 100644 --- a/cmake_modules/IcebergThirdpartyToolchain.cmake +++ b/cmake_modules/IcebergThirdpartyToolchain.cmake @@ -421,6 +421,62 @@ function(resolve_croaring_dependency) PARENT_SCOPE) endfunction() +# ---------------------------------------------------------------------- +# utf8proc + +function(resolve_utf8proc_dependency) + prepare_fetchcontent() + + if(DEFINED ENV{ICEBERG_UTF8PROC_URL}) + set(UTF8PROC_URL "$ENV{ICEBERG_UTF8PROC_URL}") + else() + set(UTF8PROC_URL + "https://github.com/JuliaStrings/utf8proc/archive/refs/tags/v2.10.0.tar.gz") + endif() + + fetchcontent_declare(utf8proc + ${FC_DECLARE_COMMON_OPTIONS} + URL ${UTF8PROC_URL} + FIND_PACKAGE_ARGS + NAMES + utf8proc + CONFIG) + fetchcontent_makeavailable(utf8proc) + + if(utf8proc_SOURCE_DIR) + if(NOT TARGET utf8proc::utf8proc) + add_library(utf8proc::utf8proc INTERFACE IMPORTED) 
+ target_link_libraries(utf8proc::utf8proc INTERFACE utf8proc) + target_include_directories(utf8proc::utf8proc INTERFACE ${utf8proc_SOURCE_DIR}) + endif() + + set(UTF8PROC_VENDORED TRUE) + # utf8proc's CMake puts a raw build-tree path in INTERFACE_INCLUDE_DIRECTORIES, which + # install(EXPORT) rejects. Wrap it in BUILD_INTERFACE so the export is valid; utf8proc + # is a private dependency, so installed consumers never need its headers. + set_target_properties(utf8proc + PROPERTIES OUTPUT_NAME "iceberg_vendored_utf8proc" + POSITION_INDEPENDENT_CODE ON + INTERFACE_INCLUDE_DIRECTORIES + "$<BUILD_INTERFACE:${utf8proc_SOURCE_DIR}>") + install(TARGETS utf8proc + EXPORT iceberg_targets + RUNTIME DESTINATION "${ICEBERG_INSTALL_BINDIR}" + ARCHIVE DESTINATION "${ICEBERG_INSTALL_LIBDIR}" + LIBRARY DESTINATION "${ICEBERG_INSTALL_LIBDIR}") + else() + set(UTF8PROC_VENDORED FALSE) + list(APPEND ICEBERG_SYSTEM_DEPENDENCIES utf8proc) + endif() + + set(ICEBERG_SYSTEM_DEPENDENCIES + ${ICEBERG_SYSTEM_DEPENDENCIES} + PARENT_SCOPE) + set(UTF8PROC_VENDORED + ${UTF8PROC_VENDORED} + PARENT_SCOPE) +endfunction() + # ---------------------------------------------------------------------- # nlohmann-json @@ -719,6 +775,7 @@ endfunction() resolve_zlib_dependency() resolve_nanoarrow_dependency() resolve_croaring_dependency() +resolve_utf8proc_dependency() resolve_nlohmann_json_dependency() resolve_spdlog_dependency() diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 04a9322a1..a14c52729 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -145,23 +145,27 @@ list(APPEND "$,nanoarrow::nanoarrow_static,$,nanoarrow::nanoarrow_static,nanoarrow::nanoarrow_shared>>" nlohmann_json::nlohmann_json spdlog::spdlog + utf8proc::utf8proc ZLIB::ZLIB) list(APPEND ICEBERG_SHARED_BUILD_INTERFACE_LIBS "$,nanoarrow::nanoarrow_static,$,nanoarrow::nanoarrow_shared,nanoarrow::nanoarrow_static>>" nlohmann_json::nlohmann_json spdlog::spdlog + utf8proc::utf8proc ZLIB::ZLIB) list(APPEND
ICEBERG_STATIC_INSTALL_INTERFACE_LIBS "$,iceberg::nanoarrow_static,$,nanoarrow::nanoarrow_static,nanoarrow::nanoarrow_shared>>" "$,iceberg::nlohmann_json,$,nlohmann_json::nlohmann_json,nlohmann_json::nlohmann_json>>" - "$,iceberg::spdlog,spdlog::spdlog>") + "$,iceberg::spdlog,spdlog::spdlog>" + "$,iceberg::utf8proc,utf8proc::utf8proc>") list(APPEND ICEBERG_SHARED_INSTALL_INTERFACE_LIBS "$,iceberg::nanoarrow_static,$,nanoarrow::nanoarrow_shared,nanoarrow::nanoarrow_static>>" "$,iceberg::nlohmann_json,$,nlohmann_json::nlohmann_json,nlohmann_json::nlohmann_json>>" - "$,iceberg::spdlog,spdlog::spdlog>") + "$,iceberg::spdlog,spdlog::spdlog>" + "$,iceberg::utf8proc,utf8proc::utf8proc>") add_iceberg_lib(iceberg SOURCES diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index f0b103828..f69ce36c0 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -190,8 +190,15 @@ nanoarrow_dep = dependency('nanoarrow') nlohmann_json_dep = dependency('nlohmann_json') spdlog_dep = dependency('spdlog') zlib_dep = dependency('zlib') +utf8proc_dep = dependency('libutf8proc') -iceberg_deps = [nanoarrow_dep, nlohmann_json_dep, spdlog_dep, zlib_dep] +iceberg_deps = [ + nanoarrow_dep, + nlohmann_json_dep, + spdlog_dep, + zlib_dep, + utf8proc_dep, +] iceberg_lib = library( 'iceberg', diff --git a/src/iceberg/test/string_util_test.cc b/src/iceberg/test/string_util_test.cc index a3fd03760..3d4422f42 100644 --- a/src/iceberg/test/string_util_test.cc +++ b/src/iceberg/test/string_util_test.cc @@ -41,19 +41,30 @@ TEST(StringUtilsTest, ToUpper) { ASSERT_EQ(StringUtils::ToUpper("123"), "123"); } -// Non-ASCII (multibyte UTF-8) bytes have the high bit set, i.e. are negative when stored -// in a signed char. Only ASCII letters are converted; multibyte bytes pass through -// unchanged. The non-ASCII strings are written as explicit UTF-8 byte escapes so the test -// does not depend on the source-file encoding. See -// https://github.com/apache/iceberg-cpp/issues/613. 
-TEST(StringUtilsTest, NonAsciiPassThrough) { - // "Naïve" -> "naïve" (ï = U+00EF = 0xC3 0xAF; only the ASCII letters change). - ASSERT_EQ(StringUtils::ToLower("Na\xC3\xAFve"), "na\xC3\xAFve"); - // "café" -> "CAFé" (é = U+00E9 = 0xC3 0xA9 stays unchanged). - ASSERT_EQ(StringUtils::ToUpper("caf\xC3\xA9"), "CAF\xC3\xA9"); - // "日本語" (0xE6 0x97 0xA5 0xE6 0x9C 0xAC 0xE8 0xAA 0x9E) is returned verbatim. +// Non-ASCII strings are written as explicit UTF-8 byte escapes so the test does not +// depend on the source-file encoding. An escape is split before a following hex digit +// (e.g. "...\x9E" "E") so the \x does not absorb it. +// See https://github.com/apache/iceberg-cpp/issues/613. +TEST(StringUtilsTest, ToLowerUnicode) { + // "CAFÉ" -> "café" (É U+00C9 = 0xC3 0x89 -> é U+00E9 = 0xC3 0xA9). + ASSERT_EQ(StringUtils::ToLower("CAF\xC3\x89"), "caf\xC3\xA9"); + // "GROẞE" -> "große": capital sharp S (ẞ U+1E9E) lower-cases to ß (U+00DF), not "ss" + // as casefolding would produce. + ASSERT_EQ(StringUtils::ToLower("GRO\xE1\xBA\x9E" + "E"), + "gro\xC3\x9F" + "e"); + // "日本語" has no case mapping and is returned verbatim. ASSERT_EQ(StringUtils::ToLower("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"), "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"); + // Invalid UTF-8 (a lone 0xFF byte) is returned unchanged rather than erroring. + ASSERT_EQ(StringUtils::ToLower("\xFF"), "\xFF"); +} + +// ToUpper is intentionally ASCII-only; non-ASCII (multibyte UTF-8) bytes pass through. +TEST(StringUtilsTest, ToUpperAsciiOnly) { + // "café" -> "CAFé" (é stays unchanged). 
+ ASSERT_EQ(StringUtils::ToUpper("caf\xC3\xA9"), "CAF\xC3\xA9"); ASSERT_EQ(StringUtils::ToUpper("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"), "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"); } @@ -63,9 +74,15 @@ TEST(StringUtilsTest, EqualsIgnoreCase) { ASSERT_TRUE(StringUtils::EqualsIgnoreCase("", "")); ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "abcd")); ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "abd")); - // ASCII case is folded; non-ASCII bytes are compared as-is. ("Café" vs "café") - ASSERT_TRUE(StringUtils::EqualsIgnoreCase("Caf\xC3\xA9", "caf\xC3\xA9")); - // "café" vs "cafe": the multibyte é differs from ASCII 'e'. + // Unicode-aware: "CAFÉ" matches "café". + ASSERT_TRUE(StringUtils::EqualsIgnoreCase("CAF\xC3\x89", "caf\xC3\xA9")); + // "GROẞE" matches "große" under lowercasing (ẞ -> ß). + ASSERT_TRUE( + StringUtils::EqualsIgnoreCase("GRO\xE1\xBA\x9E" + "E", + "gro\xC3\x9F" + "e")); + // Different letters still differ ("café" vs "cafe"). ASSERT_FALSE(StringUtils::EqualsIgnoreCase("caf\xC3\xA9", "cafe")); } diff --git a/src/iceberg/util/string_util.cc b/src/iceberg/util/string_util.cc index 0454a62b5..00b938a07 100644 --- a/src/iceberg/util/string_util.cc +++ b/src/iceberg/util/string_util.cc @@ -19,10 +19,41 @@ #include "iceberg/util/string_util.h" +#include <array> + +#include <utf8proc.h> + #include "iceberg/util/macros.h" namespace iceberg { +std::string StringUtils::ToLower(std::string_view str) { + std::string result; + result.reserve(str.size()); + + const auto* data = reinterpret_cast<const utf8proc_uint8_t*>(str.data()); + const auto size = static_cast<utf8proc_ssize_t>(str.size()); + utf8proc_ssize_t offset = 0; + while (offset < size) { + utf8proc_int32_t code_point = 0; + utf8proc_ssize_t consumed = + utf8proc_iterate(data + offset, size - offset, &code_point); + if (consumed < 0) { + // Invalid UTF-8: return the input unchanged rather than erroring. + return std::string(str); + } + // utf8proc has no string-level lower-case helper, so map and re-encode each code + // point individually.
utf8proc_tolower is a simple 1:1 mapping (not casefolding). + const utf8proc_int32_t lowered = utf8proc_tolower(code_point); + std::array<utf8proc_uint8_t, 4> encoded{}; + const utf8proc_ssize_t written = utf8proc_encode_char(lowered, encoded.data()); + result.append(reinterpret_cast<const char*>(encoded.data()), + static_cast<size_t>(written)); + offset += consumed; + } + return result; +} + Result> StringUtils::HexStringToBytes(std::string_view hex) { if (hex.size() % 2 != 0) [[unlikely]] { return InvalidArgument("Hex string must have even length, got: {}", hex.size()); diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h index 01b6087b8..afb28bf2a 100644 --- a/src/iceberg/util/string_util.h +++ b/src/iceberg/util/string_util.h @@ -20,7 +20,6 @@ #pragma once #include -#include #include #include #include @@ -41,22 +40,24 @@ concept FromChars = requires(const char* p, T& v) { std::from_chars(p, p, v); }; class ICEBERG_EXPORT StringUtils { public: - // NOTE: These convert ASCII letters only; all other bytes, including non-ASCII - // (multibyte UTF-8) bytes, are passed through unchanged. - // See https://github.com/apache/iceberg-cpp/issues/613. - static std::string ToLower(std::string_view str) { - return str | std::ranges::views::transform(ToLowerAscii) | - std::ranges::to<std::string>(); - } - + /// \brief Lower-case a UTF-8 string using Unicode simple case mapping. + /// + /// Mirrors Iceberg Java's case-insensitive handling, which lower-cases names with + /// toLowerCase(Locale.ROOT). Invalid UTF-8 input is returned unchanged. + /// See https://github.com/apache/iceberg-cpp/issues/613. + static std::string ToLower(std::string_view str); + + /// \brief Upper-case ASCII letters; non-ASCII (multibyte UTF-8) bytes pass through + /// unchanged. + /// + /// Unlike ToLower this is ASCII-only, since upper-casing is not used for name matching.
static std::string ToUpper(std::string_view str) { return str | std::ranges::views::transform(ToUpperAscii) | std::ranges::to<std::string>(); } static bool EqualsIgnoreCase(std::string_view lhs, std::string_view rhs) { - return std::ranges::equal( - lhs, rhs, [](char lc, char rc) { return ToLowerAscii(lc) == ToLowerAscii(rc); }); + return ToLower(lhs) == ToLower(rhs); } static bool StartsWithIgnoreCase(std::string_view str, std::string_view prefix) { @@ -134,14 +135,8 @@ class ICEBERG_EXPORT StringUtils { } private: - // ASCII-only case conversion using explicit range checks rather than - // std::tolower/std::toupper. This is independent of the current C locale and never - // touches non-ASCII (high-bit) bytes, so multibyte UTF-8 sequences are preserved. It - // also sidesteps the undefined behavior of passing a negative char to <cctype>. - static constexpr char ToLowerAscii(char c) noexcept { - return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c; - } - + // Avoids std::toupper, which is locale-dependent and has undefined behavior for + // negative char values. static constexpr char ToUpperAscii(char c) noexcept { return (c >= 'a' && c <= 'z') ? static_cast<char>(c - 'a' + 'A') : c; } diff --git a/subprojects/utf8proc.wrap b/subprojects/utf8proc.wrap new file mode 100644 index 000000000..9b33b3bea --- /dev/null +++ b/subprojects/utf8proc.wrap @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[wrap-file] +directory = utf8proc-2.10.0 +source_url = https://github.com/JuliaStrings/utf8proc/releases/download/v2.10.0/utf8proc-2.10.0.tar.gz +source_filename = utf8proc-2.10.0.tar.gz +source_hash = 276a37dc4d1dd24d7896826a579f4439d1e5fe33603add786bb083cab802e23e +patch_filename = utf8proc_2.10.0-1_patch.zip +patch_url = https://wrapdb.mesonbuild.com/v2/utf8proc_2.10.0-1/get_patch +patch_hash = be16c4514603e922f9636045699fe1a6f844d340b9b7c14b809e47253b06a844 +source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/utf8proc_2.10.0-1/utf8proc-2.10.0.tar.gz +wrapdb_version = 2.10.0-1 + +[provide] +libutf8proc = utf8proc_dep From b8639d6801e2570a6a3ba9d05e53fd21797218dc Mon Sep 17 00:00:00 2001 From: Rahul Goel Date: Thu, 18 Jun 2026 23:51:09 -0400 Subject: [PATCH 2/2] Add license info to LICENSE --- LICENSE | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/LICENSE b/LICENSE index 374b7fc58..8d8d5ff4e 100644 --- a/LICENSE +++ b/LICENSE @@ -228,3 +228,95 @@ Home page: https://arrow.apache.org/ License: https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- + +This product bundles utf8proc, which is available under the MIT License: + +utf8proc is a software package originally developed by Jan Behrens and the rest +of the Public Software Group, now maintained by the Julia-language developers. +All new work on the utf8proc library is licensed under the MIT "expat" license: + +Copyright (c) 2014-2021 by Steven G. 
Johnson, Jiahao Chen, Tony Kelman, Jonas +Fonseca, and other contributors listed in the git history. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + +The original utf8proc is licensed under the same MIT "expat" license: + +Copyright (c) 2009, 2013 Public Software Group e. V., Berlin, Germany + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + +utf8proc also contains data derived from the Unicode data files. The following +license applies to that data: + +COPYRIGHT AND PERMISSION NOTICE + +Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed under +the Terms of Use in http://www.unicode.org/copyright.html. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of the Unicode data files and any associated documentation (the "Data +Files") or Unicode software and any associated documentation (the +"Software") to deal in the Data Files or Software without restriction, +including without limitation the rights to use, copy, modify, merge, +publish, distribute, and/or sell copies of the Data Files or Software, and +to permit persons to whom the Data Files or Software are furnished to do +so, provided that (a) the above copyright notice(s) and this permission +notice appear with all copies of the Data Files or Software, (b) both the +above copyright notice(s) and this permission notice appear in associated +documentation, and (c) there is clear notice in each modified Data File or +in the Software as well as in the documentation associated with the Data +File(s) or Software that the data or software has been modified. + +THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF +THIRD PARTY RIGHTS. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS +INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR +CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF +USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THE DATA FILES OR SOFTWARE. + +Except as contained in this notice, the name of a copyright holder shall +not be used in advertising or otherwise to promote the sale, use or other +dealings in these Data Files or Software without prior written +authorization of the copyright holder. + +Unicode and the Unicode logo are trademarks of Unicode, Inc., and may be +registered in some jurisdictions. All other trademarks and registered +trademarks mentioned herein are the property of their respective owners. + +--------------------------------------------------------------------------------