From 71f18a89192b9381f8ae40e044ebd29227929c2e Mon Sep 17 00:00:00 2001 From: taynpg Date: Wed, 12 Feb 2025 13:36:28 +0800 Subject: [PATCH] =?UTF-8?q?use=EF=BC=9A=E5=88=9D=E6=AD=A5=E8=83=BD?= =?UTF-8?q?=E7=94=A8=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .vscode/settings.json | 4 +- README.md | 6 +- jsondata.cxx | 133 ++++++++++++++++++++++++++++++++++++++++-- jsondata.h | 18 +++++- main.cxx | 49 ++++++++++++---- 5 files changed, 188 insertions(+), 22 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index d21c7b6..de1f5fa 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -18,6 +18,7 @@ } ], "args": [ + "/home/yun/Code/filecomplete", "cpp,h", "/home/yun/Code/question.txt" ] }, "cmake.configureSettings": { @@ -91,6 +92,7 @@ "stdexcept": "cpp", "streambuf": "cpp", "typeinfo": "cpp", - "valarray": "cpp" + "valarray": "cpp", + "fstream": "cpp" } } \ No newline at end of file diff --git a/README.md b/README.md index eb044f4..0f225cb 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # DeepSeek访问 -尝试实现一个可以提交大量附件,并让`DeepSeek`帮分析提取数据等的一个工具。 +尝试实现一个可以提交大量附件(按照官方要求是最多50个文件,单个不超过500MB,且仅支持文本文件),并让`DeepSeek`帮分析提取数据等的一个工具。 -# 说明 +# 编译 -如果是`windows`环境,必须使用`windows terminal`(或者其他支持读入`utf-8`字符串的终端)以支持从`cmd`命令行读入`utf-8`字符串。 \ No newline at end of file +需要`cpp17`。 \ No newline at end of file diff --git a/jsondata.cxx b/jsondata.cxx index d248711..a8c6d39 100644 --- a/jsondata.cxx +++ b/jsondata.cxx @@ -1,7 +1,8 @@ #include "jsondata.h" #include -CJsonOper::CJsonOper() +CJsonOper::CJsonOper(const std::string& user_name, const std::string& model, const std::string& assistant_name) + : user_(user_name), model_(model), assistant_(assistant_name) { } @@ -11,15 +12,53 @@ CJsonOper::~CJsonOper() std::string CJsonOper::format_request(const std::string& content) { - // 定义变量 - std::string model = "deepseek-r1"; - std::string role = "user"; - - // 构造 JSON 对象 + std::string model = model_; + std::string role = user_; nlohmann::json json_data = {{"model", model}, {"messages", {{{"role", role}, {"content", content}}}}}; return json_data.dump(); } +std::vector CJsonOper::split(const std::string& input, const std::string& delimiter) +{ + std::vector result; + size_t pos = 0, prev = 0; + while ((pos = input.find(delimiter, prev)) != std::string::npos) { + result.push_back(input.substr(prev, pos - prev)); + prev = pos + delimiter.size(); + } + result.push_back(input.substr(prev)); + return result; +} + +std::string CJsonOper::multi_format_reuqest(const std::string& content, size_t per_sec_size) +{ + std::string model = model_; + std::string role = user_; + nlohmann::json json_data; + json_data["model"] = model; + + std::vector messages; + size_t s = 0; + while (s < content.size()) { + size_t i = 0; + size_t t = 0; + while (i < per_sec_size && s + i < content.size()) { + t = get_u8_len(content[s + i]); + if (t == 0) { + std::cerr << "invalid codec!!!" << std::endl; + exit(1); + } + i += t; + } + std::string part = content.substr(s, i); + messages.push_back({{"role", role}, {"content", "\n附加数据:\n" + part}}); + s += i; + } + + json_data["messages"] = messages; + return json_data.dump(); +} + Message CJsonOper::parse(const std::string& data) { Message re; @@ -62,3 +101,85 @@ bool CJsonOper::save_md(const std::string& data, const std::string& id) of.close(); return true; } + +bool CJsonOper::read_txt(const std::string& path, std::string& out) +{ + std::ifstream file(path); + if (!file.is_open()) { + std::cout << "open failed: " << path << std::endl; + return false; + } + std::istreambuf_iterator iterf(file); + std::istreambuf_iterator iter; + std::string content(iterf, iter); + out = content; + return true; +} + +size_t CJsonOper::get_u8_len(unsigned char ch) +{ + if (ch <= 0x7F) { + return 1; + } else if ((ch & 0xE0) == 0xC0) { + return 2; + } else if ((ch & 0xF0) == 0xE0) { + return 3; + } else if ((ch & 0xF8) == 0xF0) { + return 4; + } else if ((ch & 0xFC) == 0xF8) { + return 5; + } else if ((ch & 0xFE) == 0xFC) { + return 6; + } else { + std::cerr << "invalid u8 first ch." << std::endl; + exit(1); + } + return 0; +} + +std::string CJsonOper::trim(const std::string& input) +{ + size_t start = input.find_first_not_of(" \t\n\r\f\v"); + if (start == std::string::npos) { + return ""; + } + size_t end = input.find_last_not_of(" \t\n\r\f\v"); + return input.substr(start, end - start + 1); +} + +std::string CJsonOper::get_all_dir_content(const std::string& dir, const std::string& types) +{ + auto vec = split(types, ","); + std::vector t; + for (const auto& item : vec) { + auto c = trim(item); + if (c.empty()) { + continue; + } + t.push_back("." + item); + std::cout << "use type:" << item << std::endl; + } + std::vector task; + for (const auto& entry : fs::directory_iterator(dir)) { + if (!fs::is_regular_file(entry)) { + continue; + } + auto exten = entry.path().filename().extension().string(); + if (std::find(t.begin(), t.end(), exten) != t.end()) { + std::cout << "Parse:" << entry.path().string() << std::endl; + task.push_back(entry.path().string()); + } + } + // 提取内容 + std::string content; + for (const auto& item : task) { + std::string one; + if (read_txt(item, one)) { + content.append("\n\n" + one); + } else { + std::cerr << "Can't read file: " << item << std::endl; + exit(1); + } + } + return content; +} diff --git a/jsondata.h b/jsondata.h index 74d4489..bf68fe6 100644 --- a/jsondata.h +++ b/jsondata.h @@ -1,11 +1,14 @@ #ifndef JSON_DATA #define JSON_DATA +#include #include #include #include -#include +#include +#include +namespace fs = std::filesystem; struct Message { std::string message_content; std::string reasoning_content; @@ -19,13 +22,24 @@ using json = nlohmann::json; class CJsonOper { public: - CJsonOper(); + CJsonOper(const std::string& user_name, const std::string& model, const std::string& assistant_name); ~CJsonOper(); public: std::string format_request(const std::string& content); + std::string multi_format_reuqest(const std::string& content, size_t per_sec_size); Message parse(const std::string& data); static bool save_md(const std::string& data, const std::string& id); + static bool read_txt(const std::string& path, std::string& out); + static std::vector split(const std::string& input, const std::string& delimiter); + static size_t get_u8_len(unsigned char ch); + static std::string get_all_dir_content(const std::string& dir, const std::string& types); + static std::string trim(const std::string& input); + +private: + std::string user_{}; + std::string model_{}; + std::string assistant_{}; }; #endif \ No newline at end of file diff --git a/main.cxx b/main.cxx index bb073dd..f67deb8 100644 --- a/main.cxx +++ b/main.cxx @@ -1,40 +1,69 @@ +#include "jsondata.h" #include "zapi.h" #include #include #include -#include "jsondata.h" + +constexpr auto API_ENV_KEY = "DASHSCOPE_API_KEY"; +constexpr auto BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions"; +constexpr auto USER_NAME = "user"; +constexpr auto MODEL_NAME = "deepseek-r1"; +constexpr auto ASSISTANT_NAME = "assistant"; std::string get_key() { - char* v = getenv("DASHSCOPE_API_KEY"); + char* v = getenv(API_ENV_KEY); if (v) { return std::string(v); } return ""; } -int main() +int main(int argc, char* argv[]) { + if (argc < 4) { + std::cout << "note: you need set env[" << API_ENV_KEY << "] before you start." << std::endl; + std::cout << "argument: text type files dir, types(split with ,), question file." << std::endl; + std::cout << "example: deepseek-use /home/zhang/cpps cpp,xml,h /home/zhang/question.txt" << std::endl; + return 0; + } + std::string api_key = get_key(); if (api_key.empty()) { std::cerr << "api key not found." << std::endl; return -1; } + std::string question{}; + if (!CJsonOper::read_txt(argv[3], question)) { + return -1; + } + + std::string all_content = CJsonOper::get_all_dir_content(argv[1], argv[2]); + if (all_content.empty()) { + std::cerr << "content is empty." << std::endl; + return -1; + } + + std::string req_str = question + "\n\n请查看附加数据:\n" + all_content; auto api = std::make_shared(); - auto json_oper = std::make_shared(); - api->set_base("https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions", api_key); - - // 请求的 JSON 数据 - std::string question("对于DeepSeek API,既然对话交互使用的是json格式,那我是否可以把所有的文本内容合并到json中post给deepseek,deepseek有没有说明json格式的数据大小上限值?"); - std::string q = json_oper->format_request(question); + auto json_oper = std::make_shared(USER_NAME, MODEL_NAME, ASSISTANT_NAME); + api->set_base(BASE_URL, api_key); + std::string q = json_oper->multi_format_reuqest(req_str, 1024 * 1); + //std::cout << q << std::endl; std::string recv; if (api->post(q, recv)) { auto re = json_oper->parse(recv); - CJsonOper::save_md("**最终结果:**\n\n" + re.message_content + "\n\n **思考过程:** \n" + re.reasoning_content, re.id); + std::string use = "本次`tokens`消耗:" + std::to_string(re.prompt_tokens) + "+" + std::to_string(re.completion_tokens) + + "=" + std::to_string(re.total_tokens); + CJsonOper::save_md( + use + "\n\n**最终结果:**\n\n" + re.message_content + "\n\n **思考过程:** \n\n" + re.reasoning_content, re.id); std::cout << "success." << std::endl; } + else { + std::cout << "request failed." << std::endl; + } return 0; }