使用 GOTRACEBACK 快速定位你的 Panic

近期遷移了一個 go 的專案至 k8s 機器上，發現機器不時會自動重啟，當想看重啟前日誌的時候，Goroutine 運行的狀態全部都打印了出來，由於公司雲平台查看行數限制，看到最後，還是沒有想要看到的 panic 的關鍵堆棧信息。

前期，由於頻繁的重啟，懷疑是哪里出現了未捕獲的 panic 導致的，於是在使用的第三方包 RocketMQ 和 Talos 等 SDK 包進行了生產消費初始化的 recover 並且對專案中 channel 的關鍵操作中也添加了 recover 捕獲，但是並沒有解決問題，還是時不時的出現重啟。

在查找 Go 官方文檔，發現可以設置這個環境變量 GOTRACEBACK 可以控制 panic 發生後堆棧跟蹤的打印級別。

GOTRACEBACK#

Go 運行時使用該環境變量來決定在程序崩潰或出現未處理的 panic 時應該輸出多少堆棧跟蹤信息，它是在運行 Go 程序時通過環境變量傳遞給 Go 運行時的。

none 當程序崩潰時，不輸出任何堆棧信息；
single 僅顯示導致崩潰，出現 panic 的 goroutine 的堆棧信息；
all 顯示所有的 goroutine 的堆棧信息；
system 顯示所有的 goroutine 的堆棧信息，包括運行時內部 goroutine 的信息；
crash 顯示所有的 goroutine 的堆棧信息，然後核心轉儲程序並退出；

runtime package - runtime - Go Packages

源代碼#

基於 Go 1.20 版本

設置 GOTRACEBACK#

//go:linkname setTraceback runtime/debug.SetTraceback  
func setTraceback(level string) {  
    var t uint32  
    switch level {  
    case "none":  
       t = 0  
    case "single", "":  
       t = 1 << tracebackShift  
    case "all":  
       t = 1<<tracebackShift | tracebackAll  
    case "system":  
       t = 2<<tracebackShift | tracebackAll  
    case "crash":  
       t = 2<<tracebackShift | tracebackAll | tracebackCrash  
    default:  
       t = tracebackAll  
       if n, ok := atoi(level); ok && n == int(uint32(n)) {  
          t |= uint32(n) << tracebackShift  
       }  
    }  
    // when C owns the process, simply exit'ing the process on fatal errors  
    // and panics is surprising. Be louder and abort instead.    if islibrary || isarchive {  
       t |= tracebackCrash  
    }  
  
    t |= traceback_env  
  
    atomic.Store(&traceback_cache, t)  
}

獲取 GOTRACEBACK#

// Keep a cached value to make gotraceback fast,// since we call it on every call to gentraceback.  
// The cached value is a uint32 in which the low bits  
// are the "crash" and "all" settings and the remaining  
// bits are the traceback value (0 off, 1 on, 2 include system).const (  
    tracebackCrash = 1 << iota  
    tracebackAll    tracebackShift = iota  
)  
  
var traceback_cache uint32 = 2 << tracebackShift  
var traceback_env uint32  
  
// gotraceback returns the current traceback settings.//  
// If level is 0, suppress all tracebacks.  
// If level is 1, show tracebacks, but exclude runtime frames.  
// If level is 2, show tracebacks including runtime frames.  
// If all is set, print all goroutine stacks. Otherwise, print just the current goroutine.  
// If crash is set, crash (core dump, etc) after tracebacking.//  
//go:nosplit  
func gotraceback() (level int32, all, crash bool) {  
    gp := getg()  
    t := atomic.Load(&traceback_cache)  
    crash = t&tracebackCrash != 0  
    all = gp.m.throwing >= throwTypeUser || t&tracebackAll != 0  
    if gp.m.traceback != 0 {  
       level = int32(gp.m.traceback)  
    } else if gp.m.throwing >= throwTypeRuntime {  
       // Always include runtime frames in runtime throws unless  
       // otherwise overridden by m.traceback.       level = 2  
    } else {  
       level = int32(t >> tracebackShift)  
    }  
    return  
}

基於 GOTRACEBACK 打印堆棧信息#

// gp is the crashing g running on this M, but may be a user G, while getg() is  
// always g0.  
func dopanic_m(gp *g, pc, sp uintptr) bool {  
    if gp.sig != 0 {  
       signame := signame(gp.sig)  
       if signame != "" {  
          print("[signal ", signame)  
       } else {  
          print("[signal ", hex(gp.sig))  
       }  
       print(" code=", hex(gp.sigcode0), " addr=", hex(gp.sigcode1), " pc=", hex(gp.sigpc), "]\n")  
    }  
  
    level, all, docrash := gotraceback()  
    if level > 0 {  
       if gp != gp.m.curg {  
          all = true  
       }  
       if gp != gp.m.g0 {  
          print("\n")  
          goroutineheader(gp)  
          traceback(pc, sp, 0, gp)  
       } else if level >= 2 || gp.m.throwing >= throwTypeRuntime {  
          print("\nruntime stack:\n")  
          traceback(pc, sp, 0, gp)  
       }  
       if !didothers && all {  
          didothers = true  
          tracebackothers(gp)  
       }  
    }  
    unlock(&paniclk)  
  
    if panicking.Add(-1) != 0 {  
       // Some other m is panicking too.  
       // Let it print what it needs to print.       // Wait forever without chewing up cpu.       // It will exit when it's done.       lock(&deadlock)  
       lock(&deadlock)  
    }  
  
    printDebugLog()  
  
    return docrash  
}

測試#

package main  
  
import (  
    "fmt"  
    "os"    "time")  
  
func main() {  
  
    env := os.Getenv("GOTRACEBACK")  
    fmt.Printf("GOTRACEBACK: %s\n", env)  
  
    for i := 0; i < 3; i++ {  
       go a()  
    }  
    go b()  
    for i := 0; i < 3; i++ {  
       go a()  
    }  
  
    time.Sleep(time.Second * 1)  
}  
  
func a() {  
    time.Sleep(time.Millisecond * 1)  
    fmt.Printf("aaaaaaa\n")  
}  
  
func b() {  
    time.Sleep(time.Millisecond * 1)  
    panic("b panic ......")  
}