背景

近期在给一个项目换一些底层接口的日志处理部分。把原始的类 printf 的格式化方式换成 fmtlib / C++ 20 Text Formatting 的方案。

然后发现,替换完一段未执行的代码后,会发生内存写坏的情况。

问题分析

通过跟踪生成的汇编和 gdb 的 info frame 输出,发现替换之后,栈帧占用大幅增加了。比如对这类代码:

// Log statement under investigation.
// Formats the fmt-style arguments with fmt::format_to_n into the buffer
// returned by get_log_buffer_addr(), limited to get_log_buffer_size()
// characters, then writes a terminating NUL: over the last byte when the size
// reported by format_to_n is not smaller than the buffer size, otherwise right
// after the formatted text. The result is printed to std::cout followed by '\n'.
// All locals are declared inside the do { ... } while(false) scope of the
// function in which the macro is expanded.
#define LOGFMT(...)                                                 \
    do {                                                            \
        auto buf = get_log_buffer_addr();                           \
        auto size = get_log_buffer_size();                          \
        auto fmt_res = fmt::format_to_n(buf, size, __VA_ARGS__);    \
        if (fmt_res.size >= size) {                                 \
            buf[size - 1] = 0;                                      \
        } else {                                                    \
            buf[fmt_res.size] = 0;                                  \
        }                                                           \
        std::cout << buf << '\n';                                   \
    } while(false)

// Stand-in for an arbitrary function that func2() calls after its log
// statement; the body is elided in this excerpt.
void func1() {
  //...
}

// Stand-in for a caller that emits one LOGFMT() line and then calls func1().
// `a` and `b` are not declared in this excerpt -- presumably they come from the
// elided part of the body.
void func2() {
  //...
  LOGFMT("test {}, data {}", a, b);

  func1();
}

LOGFMT(...) 宏的实现里,使用 fmt::format_to_n 相比使用 snprintf,调用 func1 时相对于 func2 的函数栈顶增量大幅增加。

理论上, LOGFMT() 的代码都处于子作用域里,无论是临时变量还是子作用域里的变量,出了作用域之后应该可以释放并被复用。 但是现在各个编译器(包括GCC、Clang和MSVC)似乎是为了方便调试信息定位和区分变量,都没有复用这部分栈空间。

而在我们项目工程里,有些地方使用了64K栈的协程。我改造的地方属于日志相关的模块,被大量多层级使用。 这两个因素叠加以后,恰好成为了压死骆驼的最后一根稻草。

问题测试代码

这里贴一下独立的复现和测试代码:

// -std=c++17 -O0 -g -ggdb -fno-omit-frame-pointer
// /Od /Zc:__cplusplus
#include <fmt/format.h>

#ifdef _MSC_VER
#  include <intrin.h>
#endif

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>

// TEST_NOINLINE_NOCLONE: function attribute that forbids inlining (and, on
// GCC, cloning) of the test functions -- presumably so that every function in
// the call chain keeps a stack frame of its own.
// The spelling depends on the compiler:
//   - clang: [[gnu::noinline]] for C++11 and later, otherwise
//     __attribute__((noinline)). clang is tested first because it also defines
//     __GNUC__.
//   - GCC > 3: [[gnu::noinline, gnu::noclone]] for C++11 with GCC >= 4.8,
//     otherwise __attribute__((noinline, noclone)).
//   - MSVC: __declspec(noinline).
//   - anything else: expands to nothing.
// A definition supplied before this point (e.g. on the command line) wins.
#ifndef TEST_NOINLINE_NOCLONE
#  if defined(__clang__)
#    if __cplusplus >= 201103L
#      define TEST_NOINLINE_NOCLONE [[gnu::noinline]]
#    else
#      define TEST_NOINLINE_NOCLONE __attribute__((noinline))
#    endif
#  elif defined(__GNUC__) && __GNUC__ > 3
#    if __cplusplus >= 201103L && (__GNUC__ * 100 + __GNUC_MINOR__) >= 408
#      define TEST_NOINLINE_NOCLONE [[gnu::noinline, gnu::noclone]]
#    else
#      define TEST_NOINLINE_NOCLONE __attribute__((noinline, noclone))
#    endif
#  elif defined(_MSC_VER)
#    define TEST_NOINLINE_NOCLONE __declspec(noinline)
#  else
#    define TEST_NOINLINE_NOCLONE
#  endif
#endif

// Stand-ins for the logging backend: one fixed-size, statically allocated
// character buffer shared by every log statement in this file.
namespace {

// Capacity of the shared log buffer in bytes (2^20 = 1 MiB).
constexpr size_t get_log_buffer_size() {
    return static_cast<size_t>(1) << 20;
}

// Returns a pointer to the first byte of the shared log buffer. The storage is
// a function-local static, so every call yields the same address.
char* get_log_buffer_addr() {
    static char storage[get_log_buffer_size()];
    return &storage[0];
}

}  // namespace

// LOGFMT(fmt, args...): format one log line and print it.
// fmt::format_to_n writes at most get_log_buffer_size() characters into the
// buffer returned by get_log_buffer_addr(). A NUL terminator is then stored
// either over the last byte of the buffer (when the size reported by
// format_to_n is not smaller than the buffer size) or directly after the
// formatted text, and the buffer is streamed to std::cout followed by '\n'.
// The helper variables (buf, size, fmt_res) are locals of the function in which
// the macro is expanded, scoped by the do { ... } while(false) block; the names
// are not uniquified, so an argument expression spelled `buf` or `size` would
// refer to the macro's own variables.
#define LOGFMT(...)                                                 \
    do {                                                            \
        auto buf = get_log_buffer_addr();                           \
        auto size = get_log_buffer_size();                          \
        auto fmt_res = fmt::format_to_n(buf, size, __VA_ARGS__);    \
        if (fmt_res.size >= size) {                                 \
            buf[size - 1] = 0;                                      \
        } else {                                                    \
            buf[fmt_res.size] = 0;                                  \
        }                                                           \
        std::cout << buf << '\n';                                   \
    } while(false)

// TEST_TOP_FRAME_ADDR: an address belonging to the current function's stack
// frame, converted to uintptr_t so that two samples can be subtracted.
// MSVC uses _AddressOfReturnAddress(); every other compiler uses
// __builtin_frame_address(0). The two intrinsics are different primitives, so
// only differences between values taken with the same compiler are meaningful.
#ifdef _MSC_VER
#  define TEST_TOP_FRAME_ADDR ((uintptr_t)_AddressOfReturnAddress())
#else
#  define TEST_TOP_FRAME_ADDR ((uintptr_t)__builtin_frame_address(0))
#endif

// Last link of the measurement chain (called by func_no_var_no_fmt); calls
// nothing further and uses std::cout only, no LOGFMT.
// Prints how far this function's frame address lies from the caller's
// (`previous`) and from the one sampled at the start of the chain (`top`).
// The text after "from" names the caller, whose frame the "previous offset"
// spans.
//   top      - frame address forwarded unchanged down the chain
//   previous - frame address sampled by the direct caller
// NOTE(review): the unsigned subtractions assume the stack grows toward lower
// addresses; otherwise the printed values wrap around.
TEST_NOINLINE_NOCLONE void func_leaf(uintptr_t top, uintptr_t previous) {
    uintptr_t current = TEST_TOP_FRAME_ADDR;
    std::cout<< "======\n"<< "(LEAF) from func_no_var_no_fmt"
        << ", previous offset: "<< (previous - current)
        << ", top offset: "<< (top - current)
        << '\n';
}

// Chain link without extra locals and without any LOGFMT use (called by
// func_no_var_fmt_once).
// Prints the distance from its own frame address to the caller's (`previous`)
// and to the chain start (`top`) via std::cout, then calls func_leaf() with its
// own frame address as the new `previous`.
//   top      - frame address forwarded unchanged down the chain
//   previous - frame address sampled by the direct caller
TEST_NOINLINE_NOCLONE void func_no_var_no_fmt(uintptr_t top, uintptr_t previous) {
    uintptr_t current = TEST_TOP_FRAME_ADDR;
    std::cout<< "======\n"<< "from func_no_var_fmt_once"
        << ", previous offset: "<< (previous - current)
        << ", top offset: "<< (top - current)
        << '\n';

    func_leaf(top, current);
}

// Chain link without extra locals that expands LOGFMT exactly once (called by
// func_no_var_fmt_twice).
// The banner with this function's name goes through LOGFMT; the offsets to the
// caller's frame (`previous`) and to the chain start (`top`) are printed with
// std::cout. Continues with func_no_var_no_fmt().
//   top      - frame address forwarded unchanged down the chain
//   previous - frame address sampled by the direct caller
TEST_NOINLINE_NOCLONE void func_no_var_fmt_once(uintptr_t top, uintptr_t previous) {
    uintptr_t current = TEST_TOP_FRAME_ADDR;
    LOGFMT("======\n{}", "func_no_var_fmt_once");
    std::cout<< "from func_no_var_fmt_twice"
        << ", previous offset: "<< (previous - current)
        << ", top offset: "<< (top - current)
        << '\n';

    func_no_var_no_fmt(top, current);
}

// Chain link without extra locals that expands LOGFMT twice (called by
// func_int_var_fmt_once): once for the banner with this function's name and
// once for the offsets to the caller's frame (`previous`) and to the chain
// start (`top`). Continues with func_no_var_fmt_once().
//   top      - frame address forwarded unchanged down the chain
//   previous - frame address sampled by the direct caller
TEST_NOINLINE_NOCLONE void func_no_var_fmt_twice(uintptr_t top, uintptr_t previous) {
    uintptr_t current = TEST_TOP_FRAME_ADDR;
    LOGFMT("======\n{}", "func_no_var_fmt_twice");
    LOGFMT("{}: previous offset: {}, top offset: {}",
            "from func_int_var_fmt_once", (previous - current), (top - current));

    func_no_var_fmt_once(top, current);
}


// Chain link with one int parameter and one int local that expands LOGFMT
// exactly once (called by func_int_var_fmt_twice).
// The banner (function name plus the sizes of p1 and p2) goes through LOGFMT;
// the address of p2 and the offsets to the caller's frame (`previous`) and to
// the chain start (`top`) are printed with std::cout. Continues with
// func_no_var_fmt_twice().
//   top      - frame address forwarded unchanged down the chain
//   previous - frame address sampled by the direct caller
//   p1       - only its size is reported; the value is unused
TEST_NOINLINE_NOCLONE void func_int_var_fmt_once(uintptr_t top, uintptr_t previous, int p1) {
    uintptr_t current = TEST_TOP_FRAME_ADDR;
    // volatile, like in the sibling functions -- presumably to keep the
    // variable on the stack.
    volatile int p2 = 0;

    // Deliberately no other locals here: anything else declared in this
    // function would inflate the very frame whose size is being measured.
    LOGFMT("======\n{}: sizeof(parameter1): {}, sizeof(stack var): {}",
            "func_int_var_fmt_once", sizeof(p1), sizeof(p2));
    std::cout<< "from func_int_var_fmt_twice"
        << ", p2 addr: " << (uintptr_t)&p2
        << ", previous offset: "<< (previous - current)
        << ", top offset: "<< (top - current)
        << '\n';

    func_no_var_fmt_twice(top, current);
}

// Chain link with one int parameter and one int local that expands LOGFMT
// twice (called by func_string_var_fmt_once): once for the banner (function
// name plus the sizes of p1 and p2) and once for the address of p2 and the
// offsets to the caller's frame (`previous`) and to the chain start (`top`).
// Continues with func_int_var_fmt_once(), forwarding p1.
//   top      - frame address forwarded unchanged down the chain
//   previous - frame address sampled by the direct caller
//   p1       - reported by size and passed on to the next link
TEST_NOINLINE_NOCLONE void func_int_var_fmt_twice(uintptr_t top, uintptr_t previous, int p1) {
    uintptr_t current = TEST_TOP_FRAME_ADDR;
    volatile int p2 = 0;
    
    LOGFMT("======\n{}: sizeof(parameter1): {}, sizeof(stack var): {}",
            "func_int_var_fmt_twice", sizeof(p1), sizeof(p2));
    LOGFMT("{}: p2 addr: {}, previous offset: {}, top offset: {}",
            "from func_string_var_fmt_once", (uintptr_t)&p2, (previous - current), (top - current));

    func_int_var_fmt_once(top, current, p1);
}

// Chain link with a by-value std::string parameter and a std::string local
// that expands LOGFMT exactly once (called by func_string_var_fmt_twice).
// The banner (function name plus the sizes of p1 and p2) goes through LOGFMT;
// p1, the address of p2 and the offsets to the caller's frame (`previous`) and
// to the chain start (`top`) are printed with std::cout. Continues with
// func_int_var_fmt_twice(), passing 0 as its int argument.
//   top      - frame address forwarded unchanged down the chain
//   previous - frame address sampled by the direct caller
//   p1       - text printed in front of the offsets (the caller passes a
//              "from <caller>" label)
TEST_NOINLINE_NOCLONE void func_string_var_fmt_once(uintptr_t top, uintptr_t previous, std::string p1) {
    uintptr_t current = TEST_TOP_FRAME_ADDR;
    // Only the size and the address of p2 are used; its contents are never read.
    volatile std::string p2 = "from " + std::string(__FUNCTION__);
    
    LOGFMT("======\n{}: sizeof(parameter1): {}, sizeof(stack var): {}",
            "func_string_var_fmt_once", sizeof(p1), sizeof(p2));
    std::cout<< p1
        << ", p2 addr: " << (uintptr_t)&p2
        << ", previous offset: "<< (previous - current)
        << ", top offset: "<< (top - current)
        << '\n';

    func_int_var_fmt_twice(top, current, 0);
}

// Chain link with a by-value std::string parameter and a std::string local
// that expands LOGFMT twice (called by func_empty): once for the banner
// (function name plus the sizes of p1 and p2) and once for p1, the address of
// p2 and the offsets to the caller's frame (`previous`) and to the chain start
// (`top`). Continues with func_string_var_fmt_once(), handing it a
// "from func_string_var_fmt_twice" label.
//   top      - frame address forwarded unchanged down the chain
//   previous - frame address sampled by the direct caller
//   p1       - text printed in front of the offsets (the caller passes a
//              "from <caller>" label)
TEST_NOINLINE_NOCLONE void func_string_var_fmt_twice(uintptr_t top, uintptr_t previous, std::string p1) {
    uintptr_t current = TEST_TOP_FRAME_ADDR;
    // Only the size and the address of p2 are used; its contents are never read.
    volatile std::string p2 = "from " + std::string(__FUNCTION__);
    
    LOGFMT("======\n{}: sizeof(parameter1): {}, sizeof(stack var): {}",
            "func_string_var_fmt_twice", sizeof(p1), sizeof(p2));
    LOGFMT("{}: p2 addr: {}: , previous offset: {}, top offset: {}",
            p1, (uintptr_t)&p2, (previous - current), (top - current));

    func_string_var_fmt_once(top, current, "from func_empty" == nullptr ? "" : "from func_string_var_fmt_twice");
}

// First link of the measurement chain (called by main); has one int local and
// does not use LOGFMT.
// Prints the address of p2 and the offsets from its own frame address to
// `previous` and `top` with std::cout, then calls func_string_var_fmt_twice()
// with a "from func_empty" label.
//   top      - frame address sampled at the start of the chain
//   previous - frame address sampled by the direct caller (main passes the same
//              value for both parameters)
TEST_NOINLINE_NOCLONE void func_empty(uintptr_t top, uintptr_t previous) {
    uintptr_t current = TEST_TOP_FRAME_ADDR;
    volatile int p2 = 0;
    
    std::cout<< "func_empty"
        << ", p2 addr: " << (uintptr_t)&p2
        << ", previous offset: "<< (previous - current)
        << ", top offset: "<< (top - current)
        << '\n';

    func_string_var_fmt_twice(top, current, "from func_empty");
}


// Entry point: samples main's own frame address and starts the call chain with
// it as both the reference point (`top`) and the caller's frame (`previous`).
int main() {
    uintptr_t current = TEST_TOP_FRAME_ADDR;
    func_empty(current, current);
    return 0;
}

各个编译器的编译指令在注释里,可以在 https://godbolt.org 上测试运行。各个编译器的结论都差不多。 这里贴一下 Clang 20 的输出:

func_empty, p2 addr: 140734321135460, previous offset: 32, top offset: 32
======
func_string_var_fmt_twice: sizeof(parameter1): 32, sizeof(stack var): 32
from func_empty: p2 addr: 140734321135184: , previous offset: 160, top offset: 192
======
func_string_var_fmt_once: sizeof(parameter1): 32, sizeof(stack var): 32
from func_string_var_fmt_twice, p2 addr: 140734321134664, previous offset: 576, top offset: 768
======
func_int_var_fmt_twice: sizeof(parameter1): 4, sizeof(stack var): 4
from func_string_var_fmt_once: p2 addr: 140734321134316, previous offset: 384, top offset: 1152
======
func_int_var_fmt_once: sizeof(parameter1): 4, sizeof(stack var): 4
from func_int_var_fmt_twice, p2 addr: 140734321134052, previous offset: 288, top offset: 1440
======
func_no_var_fmt_twice
from func_int_var_fmt_once: previous offset: 176, top offset: 1616
======
func_no_var_fmt_once
from func_no_var_fmt_twice, previous offset: 240, top offset: 1856
======
from func_no_var_fmt_once, previous offset: 128, top offset: 1984
======
(LEAF) from func_no_var_no_fmt, previous offset: 48, top offset: 2032

可以看到,每次调用fmt接口增量都挺大。在 -O2 下可能是涉及内存对齐(我没有再仔细查看汇编),offset会更大。

解决方案

这个问题我试了一些方案,都无法零开销地解决这个问题。最后采用的方法是利用匿名的Lambda对象做一次中转。相当于把内存增量控制在Lambda表达式的对象本身的开销上。(直接加函数中转可能会被内联,从而导致栈仍然无法被复用)

按上面的代码例子就是 LOGFMT(...) 改成如下形式, 其他代码不变:

// Lambda-based variant of LOGFMT.
// Performs the same steps as a plain block would -- format with
// fmt::format_to_n into the buffer from get_log_buffer_addr(), store a NUL
// terminator (over the last byte when the reported size is not smaller than
// the buffer size), print to std::cout followed by '\n' -- but inside an
// immediately invoked lambda. The arguments are reached through the
// by-reference capture ([&]), and buf/size/fmt_res are locals of the lambda's
// call operator rather than of the function in which the macro is expanded.
#define LOGFMT(...)                                                 \
    ([&]() {                                                        \
        auto buf = get_log_buffer_addr();                           \
        auto size = get_log_buffer_size();                          \
        auto fmt_res = fmt::format_to_n(buf, size, __VA_ARGS__);    \
        if (fmt_res.size >= size) {                                 \
            buf[size - 1] = 0;                                      \
        } else {                                                    \
            buf[fmt_res.size] = 0;                                  \
        }                                                           \
        std::cout << buf << '\n';                                   \
    })()

对比结果如下:

func_empty, p2 addr: 140736400274708, previous offset: 32, top offset: 32
======
func_string_var_fmt_twice: sizeof(parameter1): 32, sizeof(stack var): 32
from func_empty: p2 addr: 140736400274480: , previous offset: 160, top offset: 192
======
func_string_var_fmt_once: sizeof(parameter1): 32, sizeof(stack var): 32
from func_string_var_fmt_twice, p2 addr: 140736400274208, previous offset: 304, top offset: 496
======
func_int_var_fmt_twice: sizeof(parameter1): 4, sizeof(stack var): 4
from func_string_var_fmt_once: p2 addr: 140736400274028, previous offset: 240, top offset: 736
======
func_int_var_fmt_once: sizeof(parameter1): 4, sizeof(stack var): 4
from func_int_var_fmt_twice, p2 addr: 140736400273932, previous offset: 96, top offset: 832
======
func_no_var_fmt_twice
from func_int_var_fmt_once: previous offset: 64, top offset: 896
======
func_no_var_fmt_once
from func_no_var_fmt_twice, previous offset: 80, top offset: 976
======
from func_no_var_fmt_once, previous offset: 48, top offset: 1024
======
(LEAF) from func_no_var_no_fmt, previous offset: 48, top offset: 1072

可以看到,各层级的 previous offset 都已经控制在一个比较小的值里了。

最后

这个问题大多数场景不会遇到,但是涉及栈很小的协程,或者类似其他的应用场景可能都会碰到。

比如 https://github.com/yuanzhubi/call_in_stack 的使用场景。之前和这个库的作者交流说是用在流媒体处理的某些业务处理里。

我暂时没有进一步研究是否有跨平台的方案告知编译器去完全复用这块内存,如果可以的话应该是最佳的解法。

也欢迎有兴趣的小伙伴们互相交流探讨。