Data representation 4: Pointer and integer arithmetic

Overview

We explore library specifications and pointer, address, and integer representations, and introduce integer undefined behavior.

Arena allocator review

Example

struct node_arena {
    std::vector<node*> scratch_;

    node* allocate() {
        node* n;
        if (this->scratch_.empty()) {
            n = new node;
        } else {
            n = this->scratch_.back();
            this->scratch_.pop_back();
        }
        return n;
    }

    void deallocate(node* n) {
        this->scratch_.push_back(n);
    }
};

1.92x faster on Mac OS X (1.13s → 0.59s), 1.25x on Docker (0.51s → 0.41s)

Libraries and abstraction boundaries

struct node_arena {
    node* allocate();
    void deallocate(node* n);
    ~node_arena();
};

Allocator specification

struct node_arena {
    // Return a pointer to a freshly allocated `node`.
    node* allocate();

    // Deallocate `n`, which must refer to a live `node` that was
    // returned by a previous call to `allocate()`.
    void deallocate(node* n);

    // Free any storage used for allocated `node`s. All `node`s allocated
    // by this arena must be passed to `deallocate` before the arena
    // is destroyed.
    ~node_arena();
};

Specifications are partial

struct node_arena {
    node* allocate() {
        return new node;
    }
    void deallocate(node* n) {
        delete n;
    }
}
struct node_arena {
    std::vector<node*> scratch_;
    node* allocate() {
        node* n;
        if (scratch_.empty()) {
            n = new node;
        } else {
            n = scratch_.back();
            scratch_.pop_back();
        }
        return n;
    }
    void deallocate(node* n) {
        scratch_.push_back(n);
    }
    ~node_area() { ... }
}
struct node_arena {
    node* allocate() {
        return new node[10];
    }
    void deallocate(node* n) {
        delete[] n;
    }
}

Per-allocation statistics?

How to store per-allocation information

Internal metadata

struct statnode {
    node real_node;
    size_t uses = 0;
};

std::vector<statnode*> scratch_;

node* allocate() {
    statnode* n;
    if (this->scratch_.empty()) {
        n = new statnode;
    } else {
        n = this->scratch_.back();
        this->scratch_.pop_back();
    }
    ++n->uses;
    return (node*) n;
}

void deallocate(node* n) {
    this->scratch_.push_back((statnode*) n);
}

void print_uses() {
    for (auto n : this->scratch_) {
        printf("%zu uses @%p\n", n->uses, n);
    }
}

Why does this work?

struct statnode {
    node real_node;
    size_t uses = 0;
};

node* allocate() {
    statnode* n = ...;
    return (node*) n;
}

void deallocate(node* n) {
    this->scratch_.push_back((statnode*) n);
}

Pointer representation and address representation

Extra space first?

struct statnode {
    size_t uses = 0;
    node real_node;
};

Address computation

// `statnode` → `node`
statnode* sn = ...;
node* n = (node*) ((uintptr_t) sn + offsetof(statnode, real_node));

// `node` → `statnode`
node* n = ...;
statnode* n = (statnode*) ((uintptr_t) n - offsetof(statnode, real_node));

Pointer arithmetic

Pointer comparisons

T a[N];
int i, j;

T* p1 = &a[i];
T* p2 = &a[j];

p1 == p2                i == j
p1 != p2                i != j
p1 < p2                 i < j
p1 > p2                 i > j
p1 <= p2                i <= j
p1 >= p2                i >= j

Pointer addition and subtraction

T a[N];
int i, j, k;

T* p1 = &a[i];
T* p2 = &a[j];

p1 + k     ==     &a[i] + k     ==    &a[i + k]
p1 - k     ==     &a[i] - k     ==    &a[i - k]
p1 - p2    ==   &a[i] - &a[j]   ==    i - j        // type of `p1 - p2` is `ptrdiff_t` == `long`

p1 += k    ==    p1 = p1 + k    ==    p1 = &a[i + k]
++p1       ==    p1 = p1 + 1    ==    p1 = &a[i + 1]

Valid indexes

Pointer arithmetic vs. address arithmetic

T a[N];
int i, j;

T* p1 = &a[i];
T* p2 = &a[j];

uintptr_t diff1 = (uintptr_t) p1 - (uintptr_t) p2;
uintptr_t diff2 = (uintptr_t) (p1 - p2);

Which is bigger, diff1 or diff2?

Pointer arithmetic vs. address arithmetic (concretely)

int a[2] = {61, 62};

int* p1 = &a[1];
int* p2 = &a[0];

uintptr_t diff1 = (uintptr_t) p1 - (uintptr_t) p2;
uintptr_t diff2 = (uintptr_t) (p1 - p2);

printf("diff1=%zu, diff2=%zu, p1=%p, p2=%p\n", diff1, diff2, p1, p2);
hexdump_object(a);
$ ./ptrarith
diff1=4, diff2=1, p1=0x7ffee434dac4, p2=0x7ffee434dac0
7ffee434dac0  3d 00 00 00 3e 00 00 00                           |=...>...|

Cleaning up address computations

struct statnode {
    size_t uses = 0;
    node real_node;
};

// `statnode` → `node`
statnode* sn = ...;
node* n = &sn->real_node;

// `node` → `statnode`
static_assert(offsetof(statnode, real_node) == sizeof(size_t));
node* n = ...;
statnode* n = (statnode*) ((size_t*) n - 1);

Exercise: Figure out why this works!

Integer representation

#include <cstdio>
#include "hexdump.hh"

int main() {
    int i = 1;
    unsigned u = 1;

    hexdump_object(i);
    hexdump_object(u);
}

Multi-byte integers

int x = 258;
hexdump_object(x);

// => 7ffeea26fac8  02 01 00 00                                       |....|

Overflow

unsigned char n = 255;   // range representable in `n` is [0,255]
n = n + 1;               // `n` should be 256, but can’t be represented

unsigned u = 1'000'000;  // range representable in `u` is [0,4'294'967'295]
u = u * 10'000'000;      // `n` should be 10'000'000'000'000, but can’t be represented

Unsigned integer overflow

unsigned char n = 255;   // range representable in `n` is [0,255]
n = n + 1;               // `n` should be 256, but can’t be represented

unsigned u = 1'000'000;  // range representable in `u` is [0,4'294'967'295]
u = u * 10'000'000;      // `n` should be 10'000'000'000'000, but can’t be represented

printf("n=%u, u=%u\n", (unsigned) n, u);
// => n = 0, u = 1316134912
// (10'000'000'000'000 == (2**32 * 2328) + 1'316'134'912)

Two’s complement representation

Two’s complement and overflow

Example

unsigned char b = 1;
unsigned char negb = -1;
assert(negb == 255);

        b    ==     0b  0  0  0  0  0  0  0  1
   + negb    ==  +  0b  1  1  1  1  1  1  1  1
   ------        -----------------------------
        0        0b  1̸  0  0  0  0  0  0  0  0

Undefined behavior