perf runtime library


close(fd) does not completely reset

It appears that sometimes close(groupfd) does not completely reset Linux performance data collection, for example when using event groups to collect multiple counters.

The upshot is that setting up the perf run time library a second and subsequent time appears to use new Intel x86 hardware counters. This can quickly exhaust the supply of hardware counters, resulting in read() returning counters without values (e.g. 0) but reporting no error.

Remove close()

If the same performance information is wanted again, rather than calling close(fd) and setting everything up from scratch, reuse the perf event data already set up and repeat only the collection sequence (reset, enable, measure, disable, read).

In the following example code, our add_perf_event() function is initially called several times (actually four times) with different parameters, together forming a perf event group. add_perf_event() is called once for each performance counter to be monitored.
   We then measure the performance of some software of interest multiple times (e.g. when it is used in a loop).
   Rather than close(fd) and repeat the sequence of add_perf_event() calls, we leave fd open, allowing us to re-use the perf runtime data we have set up, and only repeat the PERF_EVENT_IOC_RESET, PERF_EVENT_IOC_ENABLE, code to be measured, PERF_EVENT_IOC_DISABLE sequence.
   Here we never call close(fd) and instead rely on Linux to clean up when the whole process exits.

//https://classes.engineering.wustl.edu/cse522/man-pages/perf_event_open.2.pdf
#include <errno.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <asm/unistd.h>

//glibc provides no perf_event_open() wrapper, so call the system call directly
//(wrapper as in the man page cited above)
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
                            int cpu, int group_fd, unsigned long flags){
  return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}
enum {nr=4};
struct read_format {
  __u64 nr;            /* The number of events, only if PERF_FORMAT_GROUP */
  __u64 time_enabled;  /* only if PERF_FORMAT_TOTAL_TIME_ENABLED */
  __u64 time_running;  /* only if PERF_FORMAT_TOTAL_TIME_RUNNING */
  struct {
    __u64 value;     /* The value of the event */
    __u64 id;        /* only if PERF_FORMAT_ID */
  } values[nr];
};
int add_perf_event(const int fd, const unsigned int type, const unsigned long long int config){
  struct perf_event_attr pe;
  memset(&pe, 0, sizeof(pe));
  pe.type = type;     //e.g. PERF_TYPE_HW_CACHE or PERF_TYPE_HARDWARE or PERF_TYPE_SOFTWARE ...
  pe.size = sizeof(pe);
  pe.config = config; //eg mask | PERF_COUNT_HW_CACHE_L1x or PERF_COUNT_HW_INSTRUCTIONS or PERF_COUNT_SW_CPU_CLOCK ...
  pe.disabled = (fd == -1)? 1 : 0;  //disable only on group leader, fd == -1
  pe.exclude_kernel = 1;
  pe.exclude_hv = 1;
  pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; //must match the layout of struct read_format above

  const int ret = perf_event_open(&pe, 0, -1, fd, 0);
  if (ret == -1) {
    fprintf(stderr, "Error opening %d %llx\n", fd,config);
    exit(EXIT_FAILURE);
  }
  return (fd == -1)? ret : fd; //if first call, this is our group leader
}
int fd = -1;  //will be perf runtime library file descriptor, -1 says perf info not yet set up
  if(fd == -1) { //first time
    int f_; //error checking only
  
    //from cwperson..  L1D and L1I good for read miss
    const int r = PERF_COUNT_HW_CACHE_RESULT_MISS;
    //const int r = PERF_COUNT_HW_CACHE_RESULT_ACCESS; also valid on L1D read and write
    const int o = PERF_COUNT_HW_CACHE_OP_READ;
    //const int t = PERF_COUNT_HW_CACHE_L1D;
    //const int t = PERF_COUNT_HW_CACHE_L1I;
    const unsigned long long mask = 0 | (o << 8) | (r <<16);
  
    fd = add_perf_event(fd, PERF_TYPE_HW_CACHE, mask | PERF_COUNT_HW_CACHE_L1D);
    assert(fd != -1);
    f_ = add_perf_event(fd, PERF_TYPE_HW_CACHE, mask | PERF_COUNT_HW_CACHE_L1I);
    assert(f_ == fd);
    f_ = add_perf_event(fd, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS);
    assert(f_ == fd);
    f_ = add_perf_event(fd, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_CLOCK);
    assert(f_ == fd);
  }
  ioctl(fd, PERF_EVENT_IOC_RESET, 0);
  ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
  //code to be measured goes here
  ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
  struct read_format perf_data;
  memset(&perf_data,0,sizeof(perf_data));
  const ssize_t ret = read(fd, &perf_data, sizeof(perf_data));

  assert(errno == 0); //ENOSPC may mean buffer too small 
  assert(ret == sizeof(perf_data));
  //NB do not close(fd);

struct read_format size and structure depend on struct perf_event_attr

When using struct read_format with a group of events all the wanted perf counters can be read in a single read() call. However the size and layout of the data returned depend upon the number of events in the group and upon which read_format bits are requested in struct perf_event_attr (see above), e.g. as in the sketch below.
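As a rough illustration (a sketch only, not part of the code above, following the layout described in the perf_event_open man page): with only PERF_FORMAT_GROUP requested, the kernel returns just the event count followed by one value per event, so a smaller structure is enough, while adding PERF_FORMAT_ID and the two time flags needs the full struct read_format defined earlier.

//sketch: data returned by read() when read_format is PERF_FORMAT_GROUP only
struct read_format_group_only {
  __u64 nr;         /* number of events in the group (4 in the example above) */
  __u64 value[nr];  /* one value per event, in the order they were added */
};
//adding PERF_FORMAT_ID, PERF_FORMAT_TOTAL_TIME_ENABLED and
//PERF_FORMAT_TOTAL_TIME_RUNNING grows the data to match struct read_format above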

read() sets errno

Although it is a good idea to check that the variable errno is zero (i.e. no error) after calling read(), remember that errno is widely shared and a non-zero value may indicate an error from before read() was called.
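One way round this (a minimal sketch, assuming fd and perf_data as set up in the example above) is to clear errno immediately before the read() so that any non-zero value afterwards refers to this read():

  errno = 0;                                     //forget any earlier error
  const ssize_t got = read(fd, &perf_data, sizeof(perf_data));
  if (got == -1 || errno != 0) {                 //errno now set by this read()
    perror("perf read");
    exit(EXIT_FAILURE);
  }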

Not all possible perf_event_attr config settings are valid and useful

The 64 bit config field in struct perf_event_attr above is a bit mask with three eight bit fields: the cache (7 values), the operation (3 values) and the result (2 values), giving 7 x 3 x 2 = 42 combinations. On an Intel i7-4790 64 bit CPU only the following 20 combinations appear useful:

mask     0  PERF_COUNT_HW_CACHE_L1D   PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_ACCESS
mask     2  PERF_COUNT_HW_CACHE_LL    PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_ACCESS
mask     3  PERF_COUNT_HW_CACHE_DTLB  PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_ACCESS
mask     4  PERF_COUNT_HW_CACHE_ITLB  PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_ACCESS
mask     5  PERF_COUNT_HW_CACHE_BPU   PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_ACCESS
mask     6  PERF_COUNT_HW_CACHE_NODE  PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_ACCESS
mask   256  PERF_COUNT_HW_CACHE_L1D   PERF_COUNT_HW_CACHE_OP_WRITE  PERF_COUNT_HW_CACHE_RESULT_ACCESS
mask   258  PERF_COUNT_HW_CACHE_LL    PERF_COUNT_HW_CACHE_OP_WRITE  PERF_COUNT_HW_CACHE_RESULT_ACCESS
mask   259  PERF_COUNT_HW_CACHE_DTLB  PERF_COUNT_HW_CACHE_OP_WRITE  PERF_COUNT_HW_CACHE_RESULT_ACCESS
mask   262  PERF_COUNT_HW_CACHE_NODE  PERF_COUNT_HW_CACHE_OP_WRITE  PERF_COUNT_HW_CACHE_RESULT_ACCESS
mask 65536  PERF_COUNT_HW_CACHE_L1D   PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_MISS
mask 65537  PERF_COUNT_HW_CACHE_L1I   PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_MISS
mask 65538  PERF_COUNT_HW_CACHE_LL    PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_MISS
mask 65539  PERF_COUNT_HW_CACHE_DTLB  PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_MISS
mask 65540  PERF_COUNT_HW_CACHE_ITLB  PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_MISS
mask 65541  PERF_COUNT_HW_CACHE_BPU   PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_MISS
mask 65542  PERF_COUNT_HW_CACHE_NODE  PERF_COUNT_HW_CACHE_OP_READ   PERF_COUNT_HW_CACHE_RESULT_MISS
mask 65794  PERF_COUNT_HW_CACHE_LL    PERF_COUNT_HW_CACHE_OP_WRITE  PERF_COUNT_HW_CACHE_RESULT_MISS
mask 65795  PERF_COUNT_HW_CACHE_DTLB  PERF_COUNT_HW_CACHE_OP_WRITE  PERF_COUNT_HW_CACHE_RESULT_MISS
mask 65798  PERF_COUNT_HW_CACHE_NODE  PERF_COUNT_HW_CACHE_OP_WRITE  PERF_COUNT_HW_CACHE_RESULT_MISS
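
For illustration, a minimal sketch (a hypothetical helper, not part of the code above) showing how the mask values in the table are built from the three eight bit fields:

#include <linux/perf_event.h>
//combine cache, operation and result into the config bit mask described above
static unsigned long long hw_cache_config(const unsigned int cache,
                                          const unsigned int op,
                                          const unsigned int result){
  return cache | (op << 8) | (result << 16);
}
//e.g. hw_cache_config(PERF_COUNT_HW_CACHE_LL,
//                     PERF_COUNT_HW_CACHE_OP_READ,
//                     PERF_COUNT_HW_CACHE_RESULT_MISS) gives mask 65538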


read_format PERF_FORMAT_TOTAL_TIME_ENABLED and PERF_FORMAT_TOTAL_TIME_RUNNING

When the PERF_FORMAT_TOTAL_TIME_ENABLED or PERF_FORMAT_TOTAL_TIME_RUNNING bits in read_format are set, this appears to enable measurements which only include time when the process is active. That is, time when the process is waiting is not included, so they cannot be used to measure elapsed wall clock time.
   When used multiple times (e.g. in the loop described above) both PERF_FORMAT_TOTAL_TIME_ENABLED and PERF_FORMAT_TOTAL_TIME_RUNNING appear to give running totals, i.e. adding to the previous values, rather than resetting to zero and starting again.
   In contrast L1 cache values appear to reset to zero.
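
If per measurement times are wanted, one work round (a sketch only, assuming perf_data has just been filled by read(fd,...) as in the example above) is to remember the previous running totals and take the difference each time round the loop:

  static __u64 prev_enabled = 0, prev_running = 0; //running totals seen so far
  const __u64 enabled_this_time = perf_data.time_enabled - prev_enabled;
  const __u64 running_this_time = perf_data.time_running - prev_running;
  prev_enabled = perf_data.time_enabled;
  prev_running = perf_data.time_running;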

read() may give ENOSPC if the supplied buffer is not big enough

If the buffer passed to read() is not big enough, errno may be set to ENOSPC rather than being zero.
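
A minimal sketch of how this shows up (using a deliberately undersized buffer, purely for illustration):

  __u64 too_small[2]; //too small for the four event group set up above
  errno = 0;
  if (read(fd, too_small, sizeof(too_small)) == -1 && errno == ENOSPC)
    fprintf(stderr, "buffer passed to read() too small for perf group\n");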

struct perf_event_attr disabled set only on leader of perf group

It seems to be important that the perf_event_attr field disabled is set to 1 on the first event (the group leader) and not set (i.e. set to 0) on all the other members of the group.
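This is the pattern already used by add_perf_event() above; as a minimal sketch (group_fd here is a hypothetical name for the leader's file descriptor, -1 when opening the leader itself):

  pe.disabled = (group_fd == -1)? 1 : 0; //start disabled only on the group leader
  //a single ioctl(leader_fd, PERF_EVENT_IOC_ENABLE, 0) then starts the whole group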
W.B. Langdon. Started 24 January 2023.