Quickly locate your Panic using GOTRACEBACK

Recently, a Go project was migrated to a K8s machine, and it was found that the machine occasionally restarts automatically. When trying to view the logs before the restart, the status of all Goroutines was printed out. Due to the line limit in the company's cloud platform, in the end, the critical stack information for the panic was still not visible.

Initially, due to the frequent restarts, it was suspected that there was an uncaught panic somewhere. Therefore, the third-party packages used, such as RocketMQ and Talos SDK, were modified to include recover for production consumption initialization, and recover was also added to critical operations on channels in the project. However, this did not resolve the issue, and restarts still occurred from time to time.

While searching the Go official documentation, it was discovered that the environment variable GOTRACEBACK can be set to control the level of stack trace printing after a panic occurs.

GOTRACEBACK#

The Go runtime uses this environment variable to determine how much stack trace information should be output when the program crashes or an unhandled panic occurs. It is passed to the Go runtime via an environment variable when running a Go program.

none does not output any stack information when the program crashes;
single only shows the stack information of the goroutine that caused the crash and panic;
all shows the stack information of all goroutines;
system shows the stack information of all goroutines, including information about internal runtime goroutines;
crash shows the stack information of all goroutines, then core dumps the program and exits;

runtime package - runtime - Go Packages

Source Code#

Based on Go version 1.20

Setting GOTRACEBACK#

//go:linkname setTraceback runtime/debug.SetTraceback  
func setTraceback(level string) {  
    var t uint32  
    switch level {  
    case "none":  
       t = 0  
    case "single", "":  
       t = 1 << tracebackShift  
    case "all":  
       t = 1<<tracebackShift | tracebackAll  
    case "system":  
       t = 2<<tracebackShift | tracebackAll  
    case "crash":  
       t = 2<<tracebackShift | tracebackAll | tracebackCrash  
    default:  
       t = tracebackAll  
       if n, ok := atoi(level); ok && n == int(uint32(n)) {  
          t |= uint32(n) << tracebackShift  
       }  
    }  
    // when C owns the process, simply exit'ing the process on fatal errors  
    // and panics is surprising. Be louder and abort instead.    if islibrary || isarchive {  
       t |= tracebackCrash  
    }  
  
    t |= traceback_env  
  
    atomic.Store(&traceback_cache, t)  
}

Getting GOTRACEBACK#

// Keep a cached value to make gotraceback fast,// since we call it on every call to gentraceback.  
// The cached value is a uint32 in which the low bits  
// are the "crash" and "all" settings and the remaining  
// bits are the traceback value (0 off, 1 on, 2 include system).const (  
    tracebackCrash = 1 << iota  
    tracebackAll    tracebackShift = iota  
)  
  
var traceback_cache uint32 = 2 << tracebackShift  
var traceback_env uint32  
  
// gotraceback returns the current traceback settings.//  
// If level is 0, suppress all tracebacks.  
// If level is 1, show tracebacks, but exclude runtime frames.  
// If level is 2, show tracebacks including runtime frames.  
// If all is set, print all goroutine stacks. Otherwise, print just the current goroutine.  
// If crash is set, crash (core dump, etc) after tracebacking.//  
//go:nosplit  
func gotraceback() (level int32, all, crash bool) {  
    gp := getg()  
    t := atomic.Load(&traceback_cache)  
    crash = t&tracebackCrash != 0  
    all = gp.m.throwing >= throwTypeUser || t&tracebackAll != 0  
    if gp.m.traceback != 0 {  
       level = int32(gp.m.traceback)  
    } else if gp.m.throwing >= throwTypeRuntime {  
       // Always include runtime frames in runtime throws unless  
       // otherwise overridden by m.traceback.       level = 2  
    } else {  
       level = int32(t >> tracebackShift)  
    }  
    return  
}

Printing Stack Information Based on GOTRACEBACK#

// gp is the crashing g running on this M, but may be a user G, while getg() is  
// always g0.  
func dopanic_m(gp *g, pc, sp uintptr) bool {  
    if gp.sig != 0 {  
       signame := signame(gp.sig)  
       if signame != "" {  
          print("[signal ", signame)  
       } else {  
          print("[signal ", hex(gp.sig))  
       }  
       print(" code=", hex(gp.sigcode0), " addr=", hex(gp.sigcode1), " pc=", hex(gp.sigpc), "]\n")  
    }  
  
    level, all, docrash := gotraceback()  
    if level > 0 {  
       if gp != gp.m.curg {  
          all = true  
       }  
       if gp != gp.m.g0 {  
          print("\n")  
          goroutineheader(gp)  
          traceback(pc, sp, 0, gp)  
       } else if level >= 2 || gp.m.throwing >= throwTypeRuntime {  
          print("\nruntime stack:\n")  
          traceback(pc, sp, 0, gp)  
       }  
       if !didothers && all {  
          didothers = true  
          tracebackothers(gp)  
       }  
    }  
    unlock(&paniclk)  
  
    if panicking.Add(-1) != 0 {  
       // Some other m is panicking too.  
       // Let it print what it needs to print.       // Wait forever without chewing up cpu.       // It will exit when it's done.       lock(&deadlock)  
       lock(&deadlock)  
    }  
  
    printDebugLog()  
  
    return docrash  
}

Test#

package main  
  
import (  
    "fmt"  
    "os"    "time")  
  
func main() {  
  
    env := os.Getenv("GOTRACEBACK")  
    fmt.Printf("GOTRACEBACK: %s\n", env)  
  
    for i := 0; i < 3; i++ {  
       go a()  
    }  
    go b()  
    for i := 0; i < 3; i++ {  
       go a()  
    }  
  
    time.Sleep(time.Second * 1)  
}  
  
func a() {  
    time.Sleep(time.Millisecond * 1)  
    fmt.Printf("aaaaaaa\n")  
}  
  
func b() {  
    time.Sleep(time.Millisecond * 1)  
    panic("b panic ......")  
}