From 6cada82609ca9487d750caeafcb0080752944c66 Mon Sep 17 00:00:00 2001 From: taynpg Date: Tue, 14 Jan 2025 10:30:07 +0800 Subject: [PATCH] =?UTF-8?q?add=EF=BC=9A=E6=B7=BB=E5=8A=A0=E5=8E=BB?= =?UTF-8?q?=E9=99=A4=E9=9D=9EASCII=E9=99=84=E8=BF=91=E7=A9=BA=E7=99=BD?= =?UTF-8?q?=E5=87=BD=E6=95=B0=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 1 + include/of_util.h | 5 +++++ src/of_util.cpp | 42 ++++++++++++++++++++++++++++++++++++++++++ test/main.cpp | 11 ++++++++++- 4 files changed, 58 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e391197..86428c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,6 +24,7 @@ set(SRC_FILES ) include_directories(include) +include_directories(3rd) if(DEFINED USE_TEST) message(STATUS "USE TEST") enable_testing() diff --git a/include/of_util.h b/include/of_util.h index e0b145e..8834554 100644 --- a/include/of_util.h +++ b/include/of_util.h @@ -245,6 +245,11 @@ public: static std::string u8ToGBK(const std::string& str); static std::string GBKTou8(const std::string& str); #endif + /// @brief 删除,段落中的空白字符,如[你好 啊,在 哪里 ?] => [你好啊,在哪里?] + /// 仅处理非 ASCII 码附近的内容。 + /// @param str + /// @return + static ofString rbs(const ofString& str); }; typedef class CThreadSleep diff --git a/src/of_util.cpp b/src/of_util.cpp index 06bbbc1..88a1075 100644 --- a/src/of_util.cpp +++ b/src/of_util.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #ifdef _WIN32 #include @@ -150,6 +151,47 @@ std::string CCodec::GBKTou8(const std::string& str) return utf8Str; } #endif + +ofString CCodec::rbs(const ofString& str) +{ + std::string utf8_str; +#ifdef UNICODE_OFSTR + utf8::utf16to8(str.begin(), str.end(), std::back_inserter(utf8_str)); +#else + utf8_str = str; +#endif + std::vector unicode_chars; + utf8::utf8to32(utf8_str.begin(), utf8_str.end(), std::back_inserter(unicode_chars)); + std::vector processed_chars; + for (size_t i = 0; i < unicode_chars.size(); ++i) { + char32_t current = unicode_chars[i]; + if (current == U' ' || current == U'\t' || current == U'\n' || current == U'\r') { + bool near_non_ascii = false; + if (i > 0 && unicode_chars[i - 1] > 0x7F) { + near_non_ascii = true; + } + if (i + 1 < unicode_chars.size() && unicode_chars[i + 1] > 0x7F) { + near_non_ascii = true; + } + if (near_non_ascii) { + continue; + } + } + processed_chars.push_back(current); + } + + std::string result_utf8; + utf8::utf32to8(processed_chars.begin(), processed_chars.end(), std::back_inserter(result_utf8)); + + ofString result; +#ifdef UNICODE_OFSTR + utf8::utf8to16(result_utf8.begin(), result_utf8.end(), std::back_inserter(result)); +#else + result = result_utf8; +#endif + return result; +} + CThreadSleep::CThreadSleep() { is_stop_sleep_ = false; diff --git a/test/main.cpp b/test/main.cpp index 0e05048..8c8b5b6 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -1,6 +1,7 @@ #include -#include "of_str.h" +#include #include +#include #include using namespace ofen; @@ -19,10 +20,18 @@ void testB() assert(rp == ofT("cpNiz")); } +void testC() +{ + std::string source(u8"这是 一 个测试 用例。 "); + std::string expect(u8"这是一个测试用例。"); + assert(CCodec::rbs(source) == expect); +} + int main() { testA(); testB(); + testC(); std::cout << "Done" << std::endl; return 0; } \ No newline at end of file