From 6cada82609ca9487d750caeafcb0080752944c66 Mon Sep 17 00:00:00 2001
From: taynpg <taynpg@163.com>
Date: Tue, 14 Jan 2025 10:30:07 +0800
Subject: [PATCH] =?UTF-8?q?add=EF=BC=9A=E6=B7=BB=E5=8A=A0=E5=8E=BB?=
 =?UTF-8?q?=E9=99=A4=E9=9D=9EASCII=E9=99=84=E8=BF=91=E7=A9=BA=E7=99=BD?=
 =?UTF-8?q?=E5=87=BD=E6=95=B0=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 CMakeLists.txt    |  1 +
 include/of_util.h |  5 +++++
 src/of_util.cpp   | 42 ++++++++++++++++++++++++++++++++++++++++++
 test/main.cpp     | 11 ++++++++++-
 4 files changed, 58 insertions(+), 1 deletion(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e391197..86428c5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,6 +24,7 @@ set(SRC_FILES
 )
 
 include_directories(include)
+include_directories(3rd)
 if(DEFINED USE_TEST)
     message(STATUS "USE TEST")
     enable_testing()
diff --git a/include/of_util.h b/include/of_util.h
index e0b145e..8834554 100644
--- a/include/of_util.h
+++ b/include/of_util.h
@@ -245,6 +245,11 @@ public:
     static std::string u8ToGBK(const std::string& str);
     static std::string GBKTou8(const std::string& str);
 #endif
+    /// @brief Removes whitespace inside a paragraph, e.g. [你好 啊，在 哪里 ？] => [你好啊，在哪里？].
+    ///        Only whitespace (space, tab, CR, LF) next to non-ASCII characters is handled.
+    /// @param str Text to process.
+    /// @return A copy of str with that whitespace removed.
+    static ofString rbs(const ofString& str);
 };
 
 typedef class CThreadSleep
diff --git a/src/of_util.cpp b/src/of_util.cpp
index 06bbbc1..88a1075 100644
--- a/src/of_util.cpp
+++ b/src/of_util.cpp
@@ -2,6 +2,7 @@
 #include <chrono>
 #include <iomanip>
 #include <sstream>
+#include <utf8.h>
 
 #ifdef _WIN32
 #include <windows.h>
@@ -150,6 +151,47 @@ std::string CCodec::GBKTou8(const std::string& str)
     return utf8Str;
 }
 #endif
+
+/// Removes whitespace runs (space, tab, CR, LF) adjacent to a non-ASCII code point.
+/// Each run is dropped or kept as a whole; runs with only ASCII/boundary neighbours are kept.
+ofString CCodec::rbs(const ofString& str)
+{
+    std::string utf8_str;
+#ifdef UNICODE_OFSTR
+    utf8::utf16to8(str.begin(), str.end(), std::back_inserter(utf8_str));
+#else
+    utf8_str = str;
+#endif
+    std::vector<char32_t> unicode_chars;
+    utf8::utf8to32(utf8_str.begin(), utf8_str.end(), std::back_inserter(unicode_chars));
+    const auto is_blank = [](char32_t c) { return c == U' ' || c == U'\t' || c == U'\n' || c == U'\r'; };
+    const size_t count = unicode_chars.size();
+    std::vector<char32_t> processed_chars;
+    for (size_t i = 0; i < count;) {
+        if (!is_blank(unicode_chars[i])) {
+            processed_chars.push_back(unicode_chars[i++]);
+            continue;
+        }
+        size_t run_end = i;  // one past the last whitespace character of this run
+        while (run_end < count && is_blank(unicode_chars[run_end])) { ++run_end; }
+        const bool near_non_ascii = (i > 0 && unicode_chars[i - 1] > 0x7F) ||
+                                    (run_end < count && unicode_chars[run_end] > 0x7F);
+        if (!near_non_ascii) {  // ASCII (or string boundary) on both sides: keep the run untouched
+            processed_chars.insert(processed_chars.end(), unicode_chars.begin() + i, unicode_chars.begin() + run_end);
+        }
+        i = run_end;
+    }
+    std::string result_utf8;
+    utf8::utf32to8(processed_chars.begin(), processed_chars.end(), std::back_inserter(result_utf8));
+#ifdef UNICODE_OFSTR
+    ofString result;
+    utf8::utf8to16(result_utf8.begin(), result_utf8.end(), std::back_inserter(result));
+    return result;
+#else
+    return result_utf8;
+#endif
+}
+
 CThreadSleep::CThreadSleep()
 {
     is_stop_sleep_ = false;
diff --git a/test/main.cpp b/test/main.cpp
index 0e05048..8c8b5b6 100644
--- a/test/main.cpp
+++ b/test/main.cpp
@@ -1,6 +1,7 @@
 #include <iostream>
-#include "of_str.h"
+#include <of_str.h>
 #include <of_path.h>
+#include <of_util.h>
 #include <cassert>
 
 using namespace ofen;
@@ -19,10 +20,18 @@ void testB()
     assert(rp == ofT("cpNiz"));
 }
 
+void testC()
+{   // Blanks that touch CJK characters, including the trailing one, must all be stripped.
+    const std::string input(u8"这是 一 个测试 用例。 ");
+    const std::string wanted(u8"这是一个测试用例。");
+    assert(CCodec::rbs(input) == wanted);
+}
+
 int main()
 {
     testA();
     testB();
+    testC();  // checks CCodec::rbs on a UTF-8 sample
     std::cout << "Done" << std::endl;
     return 0;
 }
\ No newline at end of file