Before everyone had a multitude of computers of their own, computers were rare, and if you wanted to use one you had to share it.

Given that the demand for computers exceeded the supply, people had to share time on them.

Initially you could do this with a stopwatch, but it's better for the computer itself to be able to measure this time, because as computers became more complicated:

  1. Pre-emption, where one process can be interrupted to run another, means the time taken up by a program isn't just the difference between when the program started and when it ended.

  2. Multi-threading, where a program can run multiple streams of instructions simultaneously, means a program can use CPU time at a rate of more than one CPU-second per second (the sketch after this list demonstrates this).
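
As a rough demonstration of the second point, here is a minimal sketch (written for this article, not one of the example files referenced later) that spins in four threads for about a second of wall-clock time each and then reads the process CPU time; on a machine with four or more cores it typically reports close to four CPU-seconds. It assumes compilation with -pthread.

#include <pthread.h>
#include <stdio.h>
#include <time.h>

/* Busy-loop until roughly one second of wall-clock time has passed. */
static void *spin(void *arg) {
    struct timespec start, now;
    (void)arg;
    clock_gettime(CLOCK_MONOTONIC, &start);
    do {
        clock_gettime(CLOCK_MONOTONIC, &now);
    } while ((long long)(now.tv_sec - start.tv_sec) * 1000000000LL
             + (now.tv_nsec - start.tv_nsec) < 1000000000LL);
    return NULL;
}

int main(void) {
    enum {
        THREADS = 4,
    };
    pthread_t threads[THREADS];
    struct timespec cpu;

    /* Error checking is omitted to keep the sketch short. */
    for (int i = 0; i < THREADS; i++) {
        pthread_create(&threads[i], NULL, spin, NULL);
    }
    for (int i = 0; i < THREADS; i++) {
        pthread_join(threads[i], NULL);
    }

    /* Four threads each spinning for ~1 wall-clock second use roughly
     * 4 seconds of CPU time between them. */
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &cpu);
    printf("%lld.%09lld\n", (long long)cpu.tv_sec, (long long)cpu.tv_nsec);
    return 0;
}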

Computers became so pervasive that most computer users no longer need to share one, but virtual server providers still need to account for the time used. CPU time can also be used to measure how long an operation takes for profiling purposes, so that when a program is slow you know which part is most worth your time to optimise.

Getting CPU time

The CPU time is read in the same way as other clocks, with different clock IDs for each process or thread.

  1. Current process with CLOCK_PROCESS_CPUTIME_ID.

    struct timespec time;
    int ret = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time);
    
  2. Current thread with CLOCK_THREAD_CPUTIME_ID.

    struct timespec time;
    int ret = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &time);
    
  3. Another process with clock_getcpuclockid(3).

    int pid_gettime(pid_t pid, struct timespec *tp) {
        int ret;
        clockid_t clockid;
        ret = clock_getcpuclockid(pid, &clockid);
        if (ret != 0) {
            return ret;
        }
        ret = clock_gettime(clockid, tp);
        return ret;
    }
    
  4. Another thread with pthread_getcpuclockid(3).

    int thread_gettime(pthread_t thread, struct timespec *tp) {
        int ret;
        clockid_t clockid;
        ret = pthread_getcpuclockid(thread, &clockid);
        if (ret != 0) {
            return ret;
        }
        ret = clock_gettime(clockid, tp);
        return ret;
    }
    

See gettime.c for an example program that reads these times, and the Makefile for build instructions.
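
The contents of gettime.c aren't reproduced here, but a minimal sketch of a program that reads and prints the current process and thread CPU times might look like the following (the real gettime.c may well differ):

#include <stdio.h>
#include <time.h>

static void print_time(const char *label, struct timespec time) {
    printf("%s: %lld.%09lld\n", label,
           (long long)time.tv_sec, (long long)time.tv_nsec);
}

int main(void) {
    struct timespec process, thread;

    if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &process) != 0) {
        perror("clock_gettime");
        return 1;
    }
    if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &thread) != 0) {
        perror("clock_gettime");
        return 1;
    }

    print_time("process", process);
    print_time("thread", thread);
    return 0;
}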

Profiling

We can instrument code (see profile-unthreaded.c) to see how much time a section takes to run.

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <time.h>

static void print_time(FILE *f, struct timespec time) {
    fprintf(f, "%lld.%09lld\n", (long long)time.tv_sec, (long long)time.tv_nsec);
}

int main(int argc, char **argv) {
    enum {
        ITERATIONS = 1000000,
    };
    int ret, exit = 0;
    struct timespec start, end;

    ret = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    if (ret != 0) {
        perror("clock_gettime");
        exit = 1;
        goto exit;
    }

    for (int i = 0; i < ITERATIONS; i++) {
        fprintf(stdout, "% 7d\n", i);
    }

    ret = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
    if (ret != 0) {
        perror("clock_gettime");
        exit = 1;
        goto exit;
    }

    end.tv_sec -= start.tv_sec;
    end.tv_nsec -= start.tv_nsec;
    if (end.tv_nsec < 0) {
        end.tv_sec--;
        end.tv_nsec += 1000000000l;
    }

    print_time(stderr, end);

exit:
    return exit;
}
$ make profile-unthreaded
$ ./profile-unthreaded >/tmp/f
0.073965395

We can make use of threads to try to speed this up (see profile-threaded.c).

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))

static void print_time(FILE *f, struct timespec time) {
    fprintf(f, "%lld.%09lld\n", (long long)time.tv_sec, (long long)time.tv_nsec);
}

struct thread_args {
    int fd;
    int start;
    unsigned len;
};

/* Each thread writes its own contiguous range of lines.  With the values
 * used here every line is exactly 8 bytes, so line i always lands at byte
 * offset i * len no matter which thread writes it. */
void *thread_run(void *_thread_args) {
    struct thread_args *thread_args = _thread_args;
    char buf[9];
    for (int i = thread_args->start;
         i < thread_args->start + thread_args->len; i++) {
        ssize_t len = snprintf(buf, ARRAY_SIZE(buf), "% 7d\n", i);
        pwrite(thread_args->fd, buf, len, i * len);
    }
    return NULL;
}

int main(int argc, char **argv) {
    enum {
        ITERATIONS = 1000000,
        THREADS = 4,
    };
    int i, ret, exit = 0;
    struct timespec start, end;
    pthread_t threads[THREADS];
    struct thread_args thread_args[THREADS];

    ret = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    if (ret != 0) {
        perror("clock_gettime");
        exit = 1;
        goto exit;
    }

    for (i = 0; i < ARRAY_SIZE(threads); i++) {
        thread_args[i].fd = 1; /* write to stdout */
        thread_args[i].start = ITERATIONS / THREADS * i;
        thread_args[i].len = ITERATIONS / THREADS;
        ret = pthread_create(&threads[i], NULL, thread_run,
                             &thread_args[i]);
        if (ret != 0) {
            /* pthread_create returns an error number rather than
             * setting errno, so perror() can't report it. */
            fprintf(stderr, "pthread_create: %s\n", strerror(ret));
            exit = 1;
            break;
        }
    }
    if (exit != 0) {
        /* Only cancel the threads that were actually created. */
        for (i--; i >= 0; i--) {
            (void) pthread_cancel(threads[i]);
        }
        goto exit;
    }
    
    for (i = 0; i < ARRAY_SIZE(threads); i++) {
        ret = pthread_join(threads[i], NULL);
        if (ret != 0) {
            perror("pthread_join");
            exit = 1;
        }
    }
    if (exit != 0) {
        goto exit;
    }

    ret = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
    if (ret != 0) {
        perror("clock_gettime");
        exit = 1;
        goto exit;
    }

    end.tv_sec -= start.tv_sec;
    end.tv_nsec -= start.tv_nsec;
    if (end.tv_nsec < 0) {
        end.tv_sec--;
        end.tv_nsec += 1000000000l;
    }

    print_time(stderr, end);

exit:
    return exit;
}
$ make profile-threaded
$ ./profile-threaded >/tmp/f
3.185380729

By instrumenting the code we can tell that this change actually made this section a lot slower.

Don't do this

Manually instrumenting things is a lot of work, which means you are only going to do it for the bits you already suspect are slow.

GCC's -pg adds instrumentation to dump times in a format readable by gprof.
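
For example, assuming a single-file program prog.c (the file name here is hypothetical):

gcc -pg -o prog prog.c
./prog                  # running the program writes gmon.out
gprof ./prog gmon.out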

valgrind, when invoked as valgrind --tool=callgrind prog, records profiling data that kcachegrind can then display. It runs your program on an emulated CPU, so it can use its own model of how long each operation takes when accounting time, and is therefore unaffected by the overhead of the profiling itself.
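
For example (callgrind writes its profile to a file named callgrind.out.<pid> by default):

valgrind --tool=callgrind ./command
kcachegrind callgrind.out.<pid>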

perf makes use of CPU features to measure with minimum overhead.

make CFLAGS=-ggdb command
perf record --call-graph=dwarf ./command
perf report