export module kqm.str.utf; import std; template struct tag_limit { using type = From; }; template From> struct tag_limit { using type = To; }; template using tag_limit_t = tag_limit::type; export namespace kqm { namespace utf { template concept Utf8CharOrByteType = std::same_as || std::same_as || std::same_as; template concept Utf32CharType = std::same_as || std::same_as; template concept Utf8ByteIterator = std::input_iterator && (std::convertible_to, std::byte> || Utf8CharOrByteType>>); template constexpr auto is_valid_codepoint(Char codepoint) noexcept { const auto cp = static_cast(codepoint); return !(cp > 0x10'FFFF || (cp >= 0xD800 && cp <= 0xDFFF)); } template constexpr auto is_valid_utf8_byte(Char byte) noexcept { const auto value = static_cast(byte); if (value == 0xC0) return false; if (value == 0xC1) return false; if (value >= 0xF5) return false; return true; } template constexpr auto utf8_length_from_utf32(Char codepoint) noexcept -> std::size_t { const auto cp = static_cast(codepoint); if (cp <= 0x7F) { return 1uz; } else if (cp <= 0x7FF) { return 2uz; } else if (cp <= 0xFFFF) { return 3uz; } else { return 4uz; } } template constexpr auto utf8_lead_byte_length(Byte lead) noexcept -> std::size_t { auto byte = static_cast(lead); if (byte < 0x80) return 1uz; if ((byte & 0xE0) == 0xC0) return 2uz; if ((byte & 0xF0) == 0xE0) return 3uz; return 4uz; } template requires Utf8CharOrByteType>> && std::output_iterator>> auto utf32_to_utf8(char32_t codepoint, Out out) noexcept { using Byte = std::remove_reference_t>; const auto cp = static_cast(codepoint); if (cp <= 0x7F) { *out = static_cast(cp); } else if (cp <= 0x7FF) { *out = static_cast(0xC0 | (cp >> 6)); ++out; *out = static_cast(0x80 | (cp & 0x3F)); } else if (cp <= 0xFFFF) { *out = static_cast(0xE0 | (cp >> 12)); ++out; *out = static_cast(0x80 | ((cp >> 6) & 0x3F)); ++out; *out = static_cast(0x80 | (cp & 0x3F)); } else { *out = static_cast(0xF0 | (cp >> 18)); ++out; *out = static_cast(0x80 | ((cp >> 12) & 0x3F)); ++out; *out = static_cast(0x80 | ((cp >> 6) & 0x3F)); ++out; *out = static_cast(0x80 | (cp & 0x3F)); } ++out; return out; } template constexpr auto utf8_to_utf32(In in) noexcept -> std::pair { const auto lead = static_cast(*in); auto ret = char32_t{}; if (lead < 0x80) { ret = lead; } else if ((lead & 0xE0) == 0xC0) { ret = (lead & 0x1F) << 6; ++in; ret |= static_cast(*in) & 0x3F; } else if ((lead & 0xF0) == 0xE0) { ret = (lead & 0x0F) << 12; ++in; ret |= (static_cast(*in) & 0x3F) << 6; ++in; ret |= (static_cast(*in) & 0x3F); } else { ret = (lead & 0x0F) << 18; ++in; ret |= (static_cast(*in) & 0x3F) << 12; ++in; ret |= (static_cast(*in) & 0x3F) << 6; ++in; ret |= (static_cast(*in) & 0x3F); } ++in; return {ret, in}; } template struct utf8_iterator { private: Iter iter_; using base_trait = std::iterator_traits; template using limiter = tag_limit_t; public: using iterator_concept = limiter; using iterator_category = limiter; using value_type = char32_t; using difference_type = std::ptrdiff_t; explicit constexpr utf8_iterator() noexcept = default; explicit constexpr utf8_iterator(Iter iter) noexcept : iter_{iter} {} constexpr auto operator*() const noexcept -> char32_t { return utf8_to_utf32(iter_).first; } constexpr auto operator++(this utf8_iterator& self) noexcept -> utf8_iterator& { std::advance(self.iter_, utf8_lead_byte_length(*self.iter_)); return self; } constexpr auto operator++(this utf8_iterator self, int) noexcept -> utf8_iterator { ++self; return self; } constexpr auto operator--(this utf8_iterator& self) noexcept -> utf8_iterator& requires std::bidirectional_iterator { do { --self.iter_; } while ((static_cast(*self.iter_) & 0xC0) == 0x80); return self; } constexpr auto operator--(this utf8_iterator self, int) noexcept -> utf8_iterator requires std::bidirectional_iterator { --self; return self; } constexpr auto base() const noexcept -> const Iter& { return iter_; } }; template constexpr auto operator==(const utf8_iterator& lhs, const utf8_iterator& rhs) noexcept(noexcept(lhs.base() == rhs.base())) -> bool requires std::equality_comparable_with { return lhs.base() == rhs.base(); } template constexpr auto operator<=>(const utf8_iterator& lhs, const utf8_iterator& rhs) noexcept(noexcept(lhs.base() <=> rhs.base())) requires std::three_way_comparable_with { return lhs.base() <=> rhs.base(); } constexpr auto iterate_utf8(Utf8ByteIterator auto iter) noexcept { return utf8_iterator{std::move(iter)}; } constexpr auto iterate_utf8(Utf8ByteIterator auto first, Utf8ByteIterator auto last) noexcept { return std::ranges::subrange{iterate_utf8(std::move(first)), iterate_utf8(std::move(last))}; } template Sentinel> requires(!Utf8ByteIterator) constexpr auto iterate_utf8(Iter first, Sentinel last) noexcept { return std::ranges::subrange{iterate_utf8(std::move(first)), std::move(last)}; } } } struct as_utf32_t : std::ranges::range_adaptor_closure { static constexpr auto operator()(std::ranges::range auto&& r) noexcept { return kqm::utf::iterate_utf8(std::ranges::begin(std::forward_like(r)), std::ranges::end(std::forward_like(r))); } }; export namespace kqm { namespace ranges::views { constexpr auto as_utf32 = as_utf32_t{}; } namespace views = ranges::views; }