woensdag 28 september 2011

Useful back-traces in gcc

After some years of experience in Java programming, the first thing I noticed after getting my hands dirty with programming in ANSI-C was the lack of good stack tracing. Yes, it is possible to fire up 'gdb' to get a stack trace on an address exception, but I like to have a trace with function names and line numbers for any error situation in my program, even for a lot of non-fatal errors.

Ideally it should be possible for a main loop to continue even after some subroutine encountered a null pointer read, but not before gathering info about what happened. Yes, many people think that it is better to stop the program completely after this kind of error, but when writing high-performance server software I don't like the idea that almost any error will take everything down.

So what are the building blocks of this solution:
  • There is a backtrace routine in <execinfo.h> that will give us a nice list of pointers from the current stack. There are also symbol routines there, but they won't give us line numbers.
  • We have the awesome GNU command line tool addr2line that will give us our line numbers from pointers. Yes, it's a separate program, but this information gathering should be infrequent or else we have other problems to attend to first. The program is also a bit too complex to imitate, scanning binary files from all over the file system for info about lines.
  • It is possible to try to handle signals normally but that will mess up our stack a bit.
  • But that is no problem for longjmp that will remember a sane stack position for us.
  • After handling SIGSEGV outside our main loop we can print a nice stack trace, but we will lose our core file for closer inspection. That can be fixed by killing our own program with 'kill', though.
With these blocks we can build a solution that gives us a Java-like feel for those nasty exceptions.
The program is a bit long because of the feeding and reading from addr2line.

// Compile this example with: gcc -g -Wall loop.c -o loop

#include <sys/stat.h>
// to get the file length with 'fstat'
#include <unistd.h>
// for direct system access: 'write'/'read'/'close'/'unlink'/'getpid'
#include <stdlib.h>
// for 'mkstemp' / 'system' / 'exit'
#include <stdio.h>
// for 'printf' / 'fprintf' / 'getchar' / 'stderr'
#include <string.h>
// for 'strlen'
#include <execinfo.h>
// for 'backtrace'
#include <signal.h>
// for 'signal' / 'kill' / signal constants
#include <setjmp.h>
// for 'sigsetjmp' / 'siglongjmp'

char* binary;
// the file path to the current binary, set from argv[0] in 'main'

volatile sig_atomic_t main_loop;
// boolean: are we inside the main loop for different signal effects
// written by the program and read inside the signal handler, so it has
// to be an object that can be accessed safely around a signal

volatile sig_atomic_t interrupted;
// boolean: have we been interrupted by the user
// written inside the signal handler and polled by the main loop

sigjmp_buf environment;
// the saved environment just before the main loop
// it is only used with 'sigsetjmp' / 'siglongjmp' and those take a
// 'sigjmp_buf', which is not guaranteed to be the same type as 'jmp_buf'

#define maxDepth 40
// the maximum number of stack entries requested from 'backtrace'

/**
    Write the stack addresses to the open file descriptor 'fd', one
    hexadecimal number per line, as input for 'addr2line'.
    The first entry is skipped because it always lies inside the handler.
    Returns 0 on success or -1 when a line could not be written completely.
*/

static int write_addresses(int fd, void *const *addresses, int count) {
    char line[32];
    // room for "0x", 16 hexadecimal digits, the newline and the terminator
    int i;
    for (i = 1; i < count; i++) {
        // 'unsigned long' instead of 'unsigned int' to keep all the bits
        // of the address on 64 bit systems
        int len = snprintf(line, sizeof line, "0x%lx\n",
            (unsigned long)addresses[i]);
        if (len < 0 || write(fd, line, (size_t)len) != len)
            return -1;
    }
    return 0;
}

/**
    Print every newline terminated line from 'text' on the standard error,
    except for the lines that contain '??'.
    The newlines in 'text' are overwritten with string terminators.
    Characters after the last newline are not printed.
*/

static void print_resolved_lines(char *text, size_t length) {
    size_t start = 0;
    // the position of the first character of the current line
    int unresolved = 0;
    // boolean: did we see '??' somewhere on the current line
    size_t i;
    for (i = 0; i < length; i++) {
        // check the bound first to never read behind the last character
        if (text[i] == '?' && i + 1 < length && text[i + 1] == '?')
            unresolved = 1;
        if (text[i] == '\n') {
            text[i] = '\0'; // write end of string over the end of line
            if (!unresolved)
                fprintf(stderr, "%s\n", text + start);
            start = i + 1; // remember the start of the next line
            unresolved = 0;
        }
    }
}

/**
    Translate the given stack addresses to function names and line numbers
    with the external program 'addr2line' and print the result on the
    standard error. Two temporary files under /tmp are used to feed the
    addresses to 'addr2line' and to read its answer; both are removed again.
    Only the stack is used for buffers here, no heap allocation is done.
    NOTE(review): the addresses are passed on unchanged; for a position
    independent executable they presumably are load addresses that
    'addr2line' cannot resolve, linking with -no-pie may be needed.
*/

static void print_backtrace(void *const *addresses, int count) {
    char tmpfile[] = "/tmp/addresses_XXXXXX";
    // file to write the addresses in for 'addr2line' to read
    char resultfile[] = "/tmp/result_XXXXXX";
    // file for the result of 'addr2line'
    int fp;
    // file descriptor of the newly opened addresses file
    int rf;
    // file descriptor of the result file
    if (binary == NULL) {
        fprintf(stderr, "Unknown binary path, no backtrace available\n");
        return;
    }
    fp = mkstemp(tmpfile);
    if (fp < 0) {
        fprintf(stderr, "Unable to create temporary file\n");
        return;
    }
    rf = mkstemp(resultfile);
    if (rf < 0) {
        fprintf(stderr, "Unable to create temporary file\n");
        close(fp);
        unlink(tmpfile);
        return;
    }
    if (write_addresses(fp, addresses, count) < 0)
        fprintf(stderr, "Unable to write temporary file\n");
    close(fp);
    char cmd[128 + strlen(binary)];
    // enough space to hold the fixed part of the command, both temporary
    // file names and the path to the binary
    int len = snprintf(cmd, sizeof cmd,
        "/usr/bin/addr2line -p -f -s -i --exe '%s' @%s > %s",
        binary, tmpfile, resultfile);
    // the quotes keep a path with spaces together as a single argument
    if (len < 0 || (size_t)len >= sizeof cmd || system(cmd))
        fprintf(stderr, "Install addr2line to get a readable backtrace\n");
    struct stat rfstat;
    // structure for the file information on the result file
    if (fstat(rf, &rfstat) == 0 && rfstat.st_size > 0) {
        char result[(size_t)rfstat.st_size + 1];
        // enough space to hold the complete result file and a terminator
        ssize_t got = read(rf, result, (size_t)rfstat.st_size);
        // the shell wrote to the same file, our descriptor is still at
        // the start of it
        if (got > 0) {
            result[got] = '\0';
            print_resolved_lines(result, (size_t)got);
        }
    }
    close(rf);
    unlink(tmpfile); // tidy up the temp directory
    unlink(resultfile);
}

/**
    A handler for signals that shows a backtrace on the standard error.
    The signal parameter is given by the system or can be 0 when we just
    want to show a backtrace.
    On SIGINT the 'interrupted' flag is set.
    On SIGSEGV inside the main loop we jump back to the saved environment,
    outside the main loop the default action is restored and the signal is
    sent again.
*/

void handler(int sig) {
    void *array[maxDepth];
    // room for the pointers on the current stack
    if (sig > 0)
        fprintf(stderr, "\nCaught signal %i\n", sig);
    // 'backtrace' is called here and not in a helper to keep this
    // function the first and only extra entry on the reported stack
    int size = backtrace(array, maxDepth);
    // the number of pointers found
    print_backtrace(array, size);
    if (sig == SIGINT) { // the user interrupted us
        interrupted = 1;
    }
    if (sig == SIGSEGV) { // something nasty has happened
        if (main_loop) { // inside the main loop.. try to restart it
            siglongjmp(environment, 1);
        } else { // before or after the main loop perform a core dump
            signal(SIGSEGV, SIG_DFL);
            kill(getpid(), SIGSEGV);
        }
    }
}

/**
    A test function just to show a backtrace longer than a single line.
    It first prints an example backtrace, then saves the environment and
    runs the main loop until the user interrupts it or the standard input
    ends. Entering an 'E' writes through a null pointer on purpose.
*/

void test() {
    int *volatile pointer = NULL;
    // some null pointer to show a real-life segmentation fault
    // volatile: the compiler has to read the pointer at run time, so the
    // faulty write below stays a real memory access
    printf("Hit ^C and Enter to stop the main loop\n");
    printf("or enter E to cause a null pointer exception\n\n");
    printf("Example call\n");
    handler(0);
    sigsetjmp(environment, 1); // remember the current state
    // the return value doesn't matter for this example
    // with 1 we could clean-up something here
    main_loop = 1; // entering main loop..
    interrupted = 0; // not yet
    while (!interrupted) {
        printf("Inside the main loop\n");
        int key = getchar();
        if (key == EOF)
            break; // no more input, do not spin on the closed stream
        if (key == 'E')
            *pointer = 1; // obviously wrong
    }
    main_loop = 0;
    printf("Finished the main loop\n");
}

/**
    The standard ANSI-C main function.
    Remembers the path of the binary, installs the backtrace handler,
    runs the test and finally sends SIGSEGV to itself outside the main
    loop.
*/

int main(int argc, char* argv[]) {
    static const int watched[] = { SIGSEGV, SIGINT };
    // the two interrupts we are looking for
    size_t index;
    binary = argv[0];
    main_loop = 0;
    for (index = 0; index < sizeof watched / sizeof watched[0]; index++)
        signal(watched[index], handler);
    test();
    printf("Create a core dump now\n");
    kill(getpid(), SIGSEGV);
    return 0;
}