perf runtime library


close(fd) does not completely reset

It appears that sometimes close(groupfd) does not completely reset Linux performance data collection, for example when using event groups to collect multiple counters.

The upshot is that setting up the perf run time library a second and subsequent time appears to use new Intel x86 hardware counters. This can quickly exhaust the supply of hardware counters, resulting in read() returning counters without values (e.g. 0) but reporting no error.

Remove close()

If the same performance information is wanted again, rather than calling close(fd) and setting everything up from scratch, reuse the perf event data already set up and repeat only the collection sequence (reset, enable, measure, disable, read).

In the following example code, our add_perf_event() function is initially called several times (actually four times) with different parameters, together forming a perf event group. add_perf_event() is called once for each performance counter to be monitored.
   We then measure the performance of some software of interest multiple times (e.g. when it is used in a loop).
   Rather than close(fd) and repeat the sequence of add_perf_event() calls, we leave fd open, allowing us to re-use the perf runtime data we have set up, and only repeat the PERF_EVENT_IOC_RESET, PERF_EVENT_IOC_ENABLE, code to be measured, PERF_EVENT_IOC_DISABLE sequence.
   Here we never call close(fd) and instead rely on Linux to clean up when the whole process exits.

//https://classes.engineering.wustl.edu/cse522/man-pages/perf_event_open.2.pdf
#include <errno.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <asm/unistd.h>

//glibc provides no perf_event_open() wrapper, so call the system call directly
//(wrapper as in the man page cited above)
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
                            int cpu, int group_fd, unsigned long flags){
  return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}
enum {nr=4};
struct read_format {
  __u64 nr;            /* The number of events, only if PERF_FORMAT_GROUP */
  __u64 time_enabled;  /* only if PERF_FORMAT_TOTAL_TIME_ENABLED */
  __u64 time_running;  /* only if PERF_FORMAT_TOTAL_TIME_RUNNING */
  struct {
    __u64 value;     /* The value of the event */
    __u64 id;        /* only if PERF_FORMAT_ID */
  } values[nr];
};
int add_perf_event(const int fd, const unsigned int type, const unsigned long long int config){
  struct perf_event_attr pe;
  memset(&pe, 0, sizeof(pe));
  pe.type = type;     //e.g. PERF_TYPE_HW_CACHE or PERF_TYPE_HARDWARE or PERF_TYPE_SOFTWARE ...
  pe.size = sizeof(pe);
  pe.config = config; //eg mask | PERF_COUNT_HW_CACHE_L1x or PERF_COUNT_HW_INSTRUCTIONS or PERF_COUNT_SW_CPU_CLOCK ...
  pe.disabled = (fd == -1)? 1 : 0;  //disable only on group leader, fd == -1
  pe.exclude_kernel = 1;
  pe.exclude_hv = 1;
  pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; //must match the layout of struct read_format above

  const int ret = perf_event_open(&pe, 0, -1, fd, 0);
  if (ret == -1) {
    fprintf(stderr, "Error opening %d %llx\n", fd,config);
    exit(EXIT_FAILURE);
  }
  return (fd == -1)? ret : fd; //if first call, this is our group leader
}
int fd = -1;  //will be perf runtime library file descriptor, -1 says perf info not yet set up
  if(fd == -1) { //first time
    int f_; //error checking only
  
    //from cwperson..  L1D and L1I good for read miss
    const int r = PERF_COUNT_HW_CACHE_RESULT_MISS;
    //const int r = PERF_COUNT_HW_CACHE_RESULT_ACCESS; also valid on L1D read and write
    const int o = PERF_COUNT_HW_CACHE_OP_READ;
    //const int t = PERF_COUNT_HW_CACHE_L1D;
    //const int t = PERF_COUNT_HW_CACHE_L1I;
    const unsigned long long mask = 0 | (o << 8) | (r <<16);
  
    fd = add_perf_event(fd, PERF_TYPE_HW_CACHE, mask | PERF_COUNT_HW_CACHE_L1D);
    assert(fd != -1);
    f_ = add_perf_event(fd, PERF_TYPE_HW_CACHE, mask | PERF_COUNT_HW_CACHE_L1I);
    assert(f_ == fd);
    f_ = add_perf_event(fd, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS);
    assert(f_ == fd);
    f_ = add_perf_event(fd, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_CLOCK);
    assert(f_ == fd);
  }
  ioctl(fd, PERF_EVENT_IOC_RESET, 0);
  ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
  //code to be measured goes here
  ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
  struct read_format perf_data;
  memset(&perf_data,0,sizeof(perf_data));
  const ssize_t ret = read(fd, &perf_data, sizeof(perf_data));

  assert(errno == 0); //ENOSPC may mean buffer too small 
  assert(ret == sizeof(perf_data));
  //NB do not close(fd);

struct read_format size and structure depend on struct perf_event_attr

When using struct read_format with a group of events all the wanted perf counters can be read in a single read() call. However the size and layout of the data returned depend upon the number of events in the group and upon which read_format bits are requested in struct perf_event_attr (see above), e.g. as in the sketch below.
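As a rough illustration (a sketch only, not part of the code above, following the layout described in the perf_event_open man page): with only PERF_FORMAT_GROUP requested, the kernel returns just the event count followed by one value per event, so a smaller structure is enough, while adding PERF_FORMAT_ID and the two time flags needs the full struct read_format defined earlier.

//sketch: data returned by read() when read_format is PERF_FORMAT_GROUP only
struct read_format_group_only {
  __u64 nr;         /* number of events in the group (4 in the example above) */
  __u64 value[nr];  /* one value per event, in the order they were added */
};
//adding PERF_FORMAT_ID, PERF_FORMAT_TOTAL_TIME_ENABLED and
//PERF_FORMAT_TOTAL_TIME_RUNNING grows the data to match struct read_format above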

read() sets errno

Although it is a good idea to check that the variable errno is zero (i.e. no error) after calling read(), remember that errno is widely shared and a non-zero value may indicate an error from before read() was called.
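One way round this (a minimal sketch, assuming fd and perf_data as set up in the example above) is to clear errno immediately before the read() so that any non-zero value afterwards refers to this read():

  errno = 0;                                     //forget any earlier error
  const ssize_t got = read(fd, &perf_data, sizeof(perf_data));
  if (got == -1 || errno != 0) {                 //errno now set by this read()
    perror("perf read");
    exit(EXIT_FAILURE);
  }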

Not all possible perf_event_attr config settings are valid and useful

The 64 bit config field in struct perf_event_attr above is a bit mask with three eight bit fields: the cache (7 values), the operation (3 values) and the result (2 values), giving 7 x 3 x 2 = 42 combinations. On an Intel i7-4790 64 bit CPU only the following 20 combinations appear useful:

mask     0  PERF_COUNT_HW_CACHE_L1D   PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_ACCESS
mask     2  PERF_COUNT_HW_CACHE_LL    PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_ACCESS
mask     3  PERF_COUNT_HW_CACHE_DTLB  PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_ACCESS
mask     4  PERF_COUNT_HW_CACHE_ITLB  PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_ACCESS
mask     5  PERF_COUNT_HW_CACHE_BPU   PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_ACCESS
mask     6  PERF_COUNT_HW_CACHE_NODE  PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_ACCESS
mask   256  PERF_COUNT_HW_CACHE_L1D   PERF_COUNT_HW_CACHE_OP_WRITE  PERF_COUNT_HW_CACHE_RESULT_ACCESS
mask   258  PERF_COUNT_HW_CACHE_LL    PERF_COUNT_HW_CACHE_OP_WRITE  PERF_COUNT_HW_CACHE_RESULT_ACCESS
mask   259  PERF_COUNT_HW_CACHE_DTLB  PERF_COUNT_HW_CACHE_OP_WRITE  PERF_COUNT_HW_CACHE_RESULT_ACCESS
mask   262  PERF_COUNT_HW_CACHE_NODE  PERF_COUNT_HW_CACHE_OP_WRITE  PERF_COUNT_HW_CACHE_RESULT_ACCESS
mask 65536  PERF_COUNT_HW_CACHE_L1D   PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_MISS
mask 65537  PERF_COUNT_HW_CACHE_L1I   PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_MISS
mask 65538  PERF_COUNT_HW_CACHE_LL    PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_MISS
mask 65539  PERF_COUNT_HW_CACHE_DTLB  PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_MISS
mask 65540  PERF_COUNT_HW_CACHE_ITLB  PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_MISS
mask 65541  PERF_COUNT_HW_CACHE_BPU   PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_MISS
mask 65542  PERF_COUNT_HW_CACHE_NODE  PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_MISS
mask 65794  PERF_COUNT_HW_CACHE_LL    PERF_COUNT_HW_CACHE_OP_WRITE  PERF_COUNT_HW_CACHE_RESULT_MISS
mask 65795  PERF_COUNT_HW_CACHE_DTLB  PERF_COUNT_HW_CACHE_OP_WRITE  PERF_COUNT_HW_CACHE_RESULT_MISS
mask 65798  PERF_COUNT_HW_CACHE_NODE  PERF_COUNT_HW_CACHE_OP_WRITE  PERF_COUNT_HW_CACHE_RESULT_MISS
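
For illustration, a minimal sketch (a hypothetical helper, not part of the code above) showing how the mask values in the table are built from the three eight bit fields:

#include <linux/perf_event.h>
//combine cache, operation and result into the config bit mask described above
static unsigned long long hw_cache_config(const unsigned int cache,
                                          const unsigned int op,
                                          const unsigned int result){
  return cache | (op << 8) | (result << 16);
}
//e.g. hw_cache_config(PERF_COUNT_HW_CACHE_LL,
//                     PERF_COUNT_HW_CACHE_OP_READ,
//                     PERF_COUNT_HW_CACHE_RESULT_MISS) gives mask 65538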


read_format PERF_FORMAT_TOTAL_TIME_ENABLED and PERF_FORMAT_TOTAL_TIME_RUNNING

When the PERF_FORMAT_TOTAL_TIME_ENABLED or PERF_FORMAT_TOTAL_TIME_RUNNING bits in read_format are set, this appears to enable measurements which only include time when the process is active. That is, time when the process is waiting is not included, so they cannot be used to measure elapsed wall clock time.
   When used multiple times (e.g. in the loop described above) both PERF_FORMAT_TOTAL_TIME_ENABLED and PERF_FORMAT_TOTAL_TIME_RUNNING appear to give running totals, i.e. adding to the previous values, rather than resetting to zero and starting again.
   In contrast L1 cache values appear to reset to zero.
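
If per measurement times are wanted, one work round (a sketch only, assuming perf_data has just been filled by read(fd,...) as in the example above) is to remember the previous running totals and take the difference each time round the loop:

  static __u64 prev_enabled = 0, prev_running = 0; //running totals seen so far
  const __u64 enabled_this_time = perf_data.time_enabled - prev_enabled;
  const __u64 running_this_time = perf_data.time_running - prev_running;
  prev_enabled = perf_data.time_enabled;
  prev_running = perf_data.time_running;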

read() may give ENOSPC if the supplied buffer is not big enough

If the buffer passed to read() is not big enough, errno may be set to ENOSPC rather than being zero.
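
A minimal sketch of how this shows up (using a deliberately undersized buffer, purely for illustration):

  __u64 too_small[2]; //too small for the four event group set up above
  errno = 0;
  if (read(fd, too_small, sizeof(too_small)) == -1 && errno == ENOSPC)
    fprintf(stderr, "buffer passed to read() too small for perf group\n");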

struct perf_event_attr disabled set only on leader of perf group

It seems to be important that the perf_event_attr field disabled is set to 1 on the first event (the group leader) and not set (i.e. set to 0) on all the other members of the group.
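This is the pattern already used by add_perf_event() above; as a minimal sketch (group_fd here is a hypothetical name for the leader's file descriptor, -1 when opening the leader itself):

  pe.disabled = (group_fd == -1)? 1 : 0; //start disabled only on the group leader
  //a single ioctl(leader_fd, PERF_EVENT_IOC_ENABLE, 0) then starts the whole group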
W.B. Langdon. Started 24 January 2023.