From ea1007a31f6c8829c1a23449af3aeaed10fe6d64 Mon Sep 17 00:00:00 2001 From: keqingmoe Date: Fri, 7 Feb 2025 05:16:55 +0800 Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9E=20module/utf.cppm?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- module/utf.cppm | 267 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 module/utf.cppm diff --git a/module/utf.cppm b/module/utf.cppm new file mode 100644 index 0000000..9370b0d --- /dev/null +++ b/module/utf.cppm @@ -0,0 +1,267 @@ +export module kqm.str.utf; + +import std; + +template +struct tag_limit +{ + using type = From; +}; + +template From> +struct tag_limit +{ + using type = To; +}; + +template +using tag_limit_t = tag_limit::type; + +export namespace kqm +{ + +namespace utf +{ + +template +concept Utf8CharOrByteType = std::same_as || std::same_as || std::same_as; + +template +concept Utf32CharType = std::same_as || std::same_as; + +template +concept Utf8ByteIterator = std::input_iterator + && (std::convertible_to, std::byte> + || Utf8CharOrByteType>>); + +template +constexpr auto is_valid_codepoint(Char codepoint) noexcept +{ + const auto cp = static_cast(codepoint); + + return !(cp > 0x10'FFFF || (cp >= 0xD800 && cp <= 0xDFFF)); +} + +template +constexpr auto is_valid_utf8_byte(Char byte) noexcept +{ + const auto value = static_cast(byte); + + if (value == 0xC0) return false; + if (value == 0xC1) return false; + if (value >= 0xF5) return false; + return true; +} + +template +constexpr auto utf8_length_from_utf32(Char codepoint) noexcept -> std::size_t +{ + const auto cp = static_cast(codepoint); + + if (cp <= 0x7F) { + return 1uz; + } else if (cp <= 0x7FF) { + return 2uz; + } else if (cp <= 0xFFFF) { + return 3uz; + } else { + return 4uz; + } +} + +template +constexpr auto utf8_lead_byte_length(Byte lead) noexcept -> std::size_t +{ + auto byte = static_cast(lead); + + if (byte < 0x80) return 1uz; + if ((byte & 0xE0) == 0xC0) return 2uz; + if ((byte & 0xF0) == 0xE0) return 3uz; + return 4uz; +} + +template + requires Utf8CharOrByteType>> + && std::output_iterator>> +auto utf32_to_utf8(char32_t codepoint, Out out) noexcept +{ + using Byte = std::remove_reference_t>; + + const auto cp = static_cast(codepoint); + + if (cp <= 0x7F) { + *out = static_cast(cp); + } else if (cp <= 0x7FF) { + *out = static_cast(0xC0 | (cp >> 6)); + ++out; + *out = static_cast(0x80 | (cp & 0x3F)); + } else if (cp <= 0xFFFF) { + *out = static_cast(0xE0 | (cp >> 12)); + ++out; + *out = static_cast(0x80 | ((cp >> 6) & 0x3F)); + ++out; + *out = static_cast(0x80 | (cp & 0x3F)); + } else { + *out = static_cast(0xF0 | (cp >> 18)); + ++out; + *out = static_cast(0x80 | ((cp >> 12) & 0x3F)); + ++out; + *out = static_cast(0x80 | ((cp >> 6) & 0x3F)); + ++out; + *out = static_cast(0x80 | (cp & 0x3F)); + } + ++out; + return out; +} + +template +constexpr auto utf8_to_utf32(In in) noexcept -> std::pair +{ + const auto lead = static_cast(*in); + + auto ret = char32_t{}; + if (lead < 0x80) { + ret = lead; + } else if ((lead & 0xE0) == 0xC0) { + ret = (lead & 0x1F) << 6; + ++in; + ret |= static_cast(*in) & 0x3F; + } else if ((lead & 0xF0) == 0xE0) { + ret = (lead & 0x0F) << 12; + ++in; + ret |= (static_cast(*in) & 0x3F) << 6; + ++in; + ret |= (static_cast(*in) & 0x3F); + } else { + ret = (lead & 0x0F) << 18; + ++in; + ret |= (static_cast(*in) & 0x3F) << 12; + ++in; + ret |= (static_cast(*in) & 0x3F) << 6; + ++in; + ret |= (static_cast(*in) & 0x3F); + } + ++in; + return {ret, in}; +} + +template +struct utf8_iterator +{ +private: + Iter iter_; + + using base_trait = std::iterator_traits; + + template + using limiter = tag_limit_t; + +public: + using iterator_concept = limiter; + using iterator_category = limiter; + using value_type = char32_t; + using difference_type = std::ptrdiff_t; + + explicit constexpr utf8_iterator() noexcept = default; + + explicit constexpr utf8_iterator(Iter iter) noexcept : iter_{iter} {} + + constexpr auto operator*() const noexcept -> char32_t + { + return utf8_to_utf32(iter_).first; + } + + constexpr auto operator++(this utf8_iterator& self) noexcept -> utf8_iterator& + { + std::advance(self.iter_, utf8_lead_byte_length(*self.iter_)); + return self; + } + + constexpr auto operator++(this utf8_iterator self, int) noexcept -> utf8_iterator + { + ++self; + return self; + } + + constexpr auto operator--(this utf8_iterator& self) noexcept -> utf8_iterator& + requires std::bidirectional_iterator + { + do { + --self.iter_; + } while ((static_cast(*self.iter_) & 0xC0) == 0x80); + return self; + } + + constexpr auto operator--(this utf8_iterator self, int) noexcept -> utf8_iterator + requires std::bidirectional_iterator + { + --self; + return self; + } + + constexpr auto base() const noexcept -> const Iter& + { + return iter_; + } +}; + +template +constexpr auto operator==(const utf8_iterator& lhs, const utf8_iterator& rhs) + noexcept(noexcept(lhs.base() == rhs.base())) -> bool + requires std::equality_comparable_with +{ + return lhs.base() == rhs.base(); +} + +template +constexpr auto operator<=>(const utf8_iterator& lhs, const utf8_iterator& rhs) + noexcept(noexcept(lhs.base() <=> rhs.base())) + requires std::three_way_comparable_with +{ + return lhs.base() <=> rhs.base(); +} + +constexpr auto iterate_utf8(Utf8ByteIterator auto iter) noexcept +{ + return utf8_iterator{std::move(iter)}; +} + +constexpr auto iterate_utf8(Utf8ByteIterator auto first, Utf8ByteIterator auto last) noexcept +{ + return std::ranges::subrange{iterate_utf8(std::move(first)), iterate_utf8(std::move(last))}; +} + +template Sentinel> + requires(!Utf8ByteIterator) +constexpr auto iterate_utf8(Iter first, Sentinel last) noexcept +{ + return std::ranges::subrange{iterate_utf8(std::move(first)), std::move(last)}; +} + +} + +} + +struct as_utf32_t : std::ranges::range_adaptor_closure +{ + static constexpr auto operator()(std::ranges::range auto&& r) noexcept + { + return kqm::utf::iterate_utf8(std::ranges::begin(std::forward_like(r)), + std::ranges::end(std::forward_like(r))); + } +}; + + +export namespace kqm +{ + +namespace ranges::views +{ + +constexpr auto as_utf32 = as_utf32_t{}; + +} + +namespace views = ranges::views; + +}