/* ----------------------------------------------------------------------------
Copyright (c) 2019, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/

/* ----------------------------------------------------------------------------
This implements a layer between the raw OS memory (VirtualAlloc/mmap/sbrk/..)
and the segment and huge object allocation by mimalloc. There may be multiple
implementations of this (one could be the identity going directly to the OS,
another could be a simple cache etc), but the current one uses large "regions".
In contrast to the rest of mimalloc, the "regions" are shared between threads and
need to be accessed using atomic operations.
We need this memory layer on top of the raw OS calls because:
1. On `sbrk`-like systems (such as WebAssembly) we need our own memory maps in order
   to reuse memory effectively.
2. It turns out that for large objects, between 1MiB and 32MiB (?), an OS
   allocation/free is still (much) too expensive relative to the number of accesses
   in that object :-( (`malloc-large` tests this). This means we need a cheaper way to
   reuse memory.
3. This layer can help with NUMA-aware allocation in the future.

Possible issues:
- (2) could potentially be addressed with a small per-thread cache instead, which is
  much simpler. Generally though, that requires shrinking of huge pages, may overuse
  memory per thread, and is not compatible with `sbrk`.
- Since the current regions are per-process, we need atomic operations to
  claim blocks, which may be contended.
- In the worst case, we need to search the whole region map (16KiB for 256GiB)
  linearly. At what point will direct OS calls be faster? Is there a way to
  do this better without adding too much complexity?
-----------------------------------------------------------------------------*/
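
/* Illustrative example (not part of the implementation): with 4MiB segments and a
   64-bit map, a region covers 64 * 4MiB = 256MiB of virtual memory. A 9MiB
   allocation needs ceil(9MiB / 4MiB) = 3 blocks, i.e. 3 consecutive zero bits in
   some region's map; those bits are claimed with an atomic compare-and-exchange
   and cleared again on free. */
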
#include "mimalloc.h"
#include "mimalloc-internal.h"
#include "mimalloc-atomic.h"

#include <string.h>  // memset

// Internal raw OS interface
size_t  _mi_os_large_page_size();
bool  _mi_os_protect(void* addr, size_t size);
bool  _mi_os_unprotect(void* addr, size_t size);
bool  _mi_os_commit(void* p, size_t size, mi_stats_t* stats);
bool  _mi_os_decommit(void* p, size_t size, mi_stats_t* stats);
bool  _mi_os_reset(void* p, size_t size, mi_stats_t* stats);
bool  _mi_os_unreset(void* p, size_t size, mi_stats_t* stats);
void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, mi_os_tld_t* tld);


// Constants
#if (MI_INTPTR_SIZE==8)
#define MI_HEAP_REGION_MAX_SIZE    (256 * (1ULL << 30))  // 256GiB => 16KiB for the region map
#elif (MI_INTPTR_SIZE==4)
#define MI_HEAP_REGION_MAX_SIZE    (3 * (1UL << 30))    // 3GiB => 192 bytes for the region map
#else
#error "define the maximum heap space allowed for regions on this platform"
#endif

#define MI_SEGMENT_ALIGN          MI_SEGMENT_SIZE

#define MI_REGION_MAP_BITS        (MI_INTPTR_SIZE * 8)
#define MI_REGION_SIZE            (MI_SEGMENT_SIZE * MI_REGION_MAP_BITS)
#define MI_REGION_MAX_ALLOC_SIZE  ((MI_REGION_MAP_BITS/4)*MI_SEGMENT_SIZE)  // 64MiB
#define MI_REGION_MAX             (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE)
#define MI_REGION_MAP_FULL        UINTPTR_MAX
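
// Worked example of the derived sizes on a 64-bit platform (assuming MI_SEGMENT_SIZE
// is 4MiB, as noted below): MI_REGION_MAP_BITS = 64, MI_REGION_SIZE = 64 * 4MiB = 256MiB,
// MI_REGION_MAX_ALLOC_SIZE = 16 * 4MiB = 64MiB, and MI_REGION_MAX = 256GiB / 256MiB = 1024
// regions, giving a `regions` array of 1024 * 16 bytes = 16KiB.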


// A region owns a chunk of MI_REGION_SIZE (256MiB) virtual memory, with
// a bit map that has one bit per MI_SEGMENT_SIZE (4MiB) block.
typedef struct mem_region_s {
  volatile uintptr_t map;    // in-use bit per MI_SEGMENT_SIZE block
  volatile void*     start;  // start of virtual memory area
} mem_region_t;


// The region map: 16KiB for a 256GiB MI_HEAP_REGION_MAX_SIZE
// TODO: in the future, maintain a map per NUMA node for numa aware allocation
static mem_region_t regions[MI_REGION_MAX];

static volatile size_t regions_count = 0;        // allocated regions


/* ----------------------------------------------------------------------------
Utility functions
-----------------------------------------------------------------------------*/

// Blocks (of 4MiB) needed for the given size.
static size_t mi_region_block_count(size_t size) {
  mi_assert_internal(size <= MI_REGION_MAX_ALLOC_SIZE);
  return (size + MI_SEGMENT_SIZE - 1) / MI_SEGMENT_SIZE;
}

// The bit mask for a given number of blocks at a specified bit index.
static uintptr_t mi_region_block_mask(size_t blocks, size_t bitidx) {
  mi_assert_internal(blocks + bitidx <= MI_REGION_MAP_BITS);
  return ((((uintptr_t)1 << blocks) - 1) << bitidx);
}
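
// For example (illustration only): for a 9MiB request, mi_region_block_count(9MiB)
// = (9MiB + 4MiB - 1) / 4MiB = 3 blocks, and mi_region_block_mask(3, 5) =
// ((1 << 3) - 1) << 5 = 0b111 << 5 = 0xE0, i.e. bits 5..7 of the region map.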

// Return a rounded commit/reset size such that we don't fragment large OS pages into small ones.
static size_t mi_good_commit_size(size_t size) {
  if (size > (SIZE_MAX - _mi_os_large_page_size())) return size;
  return _mi_align_up(size, _mi_os_large_page_size());
}
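
// For example (assuming the OS reports 2MiB large pages): mi_good_commit_size(5MiB)
// = _mi_align_up(5MiB, 2MiB) = 6MiB; if the round-up would overflow SIZE_MAX, the
// size is returned unchanged.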

// Return `true` if a pointer points into a region reserved by us.
bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
  if (p==NULL) return false;
  size_t count = mi_atomic_read(&regions_count);
  for (size_t i = 0; i < count; i++) {
    uint8_t* start = (uint8_t*)mi_atomic_read_ptr(&regions[i].start);
    if (start != NULL && (uint8_t*)p >= start && (uint8_t*)p < start + MI_REGION_SIZE) return true;
  }
  return false;
}


/* ----------------------------------------------------------------------------
Commit from a region
-----------------------------------------------------------------------------*/

// Commit the `blocks` in `region` at `idx` and `bitidx` of a given `size`.
// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written
// if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call.
// (not being able to claim is not considered an error so check for `p != NULL` afterwards).
static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bitidx, size_t blocks, size_t size, bool commit, void** p, size_t* id, mi_os_tld_t* tld)
{
  size_t mask = mi_region_block_mask(blocks,bitidx);
  mi_assert_internal(mask != 0);
  mi_assert_internal((mask & mi_atomic_read(&region->map)) == mask);

  // ensure the region is reserved
  void* start = mi_atomic_read_ptr(&region->start);
  if (start == NULL) {
    start = _mi_os_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, mi_option_is_enabled(mi_option_eager_region_commit), tld);
    if (start == NULL) {
      // failure to allocate from the OS! unclaim the blocks and fail
      size_t map;
      do {
        map = mi_atomic_read(&region->map);
      } while (!mi_atomic_compare_exchange(&region->map, map & ~mask, map));
      return false;
    }

    // set the newly allocated region
    if (mi_atomic_compare_exchange_ptr(&region->start, start, NULL)) {
      // update the region count
      mi_atomic_increment(&regions_count);
    }
    else {
      // failed, another thread allocated just before us, free our allocated memory
      // TODO: should we keep the allocated memory and assign it to some other region?
      _mi_os_free(start, MI_REGION_SIZE, tld->stats);
      start = mi_atomic_read_ptr(&region->start);
    }
  }
  mi_assert_internal(start == mi_atomic_read_ptr(&region->start));
  mi_assert_internal(start != NULL);

  // Commit the blocks to memory
  void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE);
  if (commit && !mi_option_is_enabled(mi_option_eager_region_commit)) {
    _mi_os_commit(blocks_start, mi_good_commit_size(size), tld->stats);  // only commit needed size (unless using large OS pages)
  }

  // and return the allocation
  *p  = blocks_start;
  *id = (idx*MI_REGION_MAP_BITS) + bitidx;
  return true;
}
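
// Note on the returned `id` (illustration only): with 64 map bits per region, an
// allocation at region index 1, bit index 6 gets id = 1*64 + 6 = 70; the decode in
// `_mi_mem_free` recovers idx = 70/64 = 1 and bitidx = 70%64 = 6.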

// Use bit-scan intrinsics to quickly find the first zero bit, when the compiler supports them
#if defined(_MSC_VER)
#define MI_HAVE_BITSCAN
#include <intrin.h>
static inline size_t mi_bsf(uintptr_t x) {
  if (x==0) return 8*MI_INTPTR_SIZE;
  DWORD idx;
  #if (MI_INTPTR_SIZE==8)
  _BitScanForward64(&idx, x);
  #else
  _BitScanForward(&idx, x);
  #endif
  return idx;
}
static inline size_t mi_bsr(uintptr_t x) {
  if (x==0) return 8*MI_INTPTR_SIZE;
  DWORD idx;
  #if (MI_INTPTR_SIZE==8)
  _BitScanReverse64(&idx, x);
  #else
  _BitScanReverse(&idx, x);
  #endif
  return idx;
}
#elif defined(__GNUC__) || defined(__clang__)
#define MI_HAVE_BITSCAN
static inline size_t mi_bsf(uintptr_t x) {
  return (x==0 ? 8*MI_INTPTR_SIZE : __builtin_ctzl(x));
}
static inline size_t mi_bsr(uintptr_t x) {
  return (x==0 ? 8*MI_INTPTR_SIZE : (8*MI_INTPTR_SIZE - 1) - __builtin_clzl(x));
}
#endif
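
// For example: mi_bsf(0x28) == 3 and mi_bsr(0x28) == 5 (0x28 == 0b101000), while
// both return 8*MI_INTPTR_SIZE for a zero argument.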

// Allocate `blocks` in a `region` at `idx` of a given `size`.
// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written
// if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call.
// (not being able to claim is not considered an error so check for `p != NULL` afterwards).
static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t blocks, size_t size, bool commit, void** p, size_t* id, mi_os_tld_t* tld)
{
  mi_assert_internal(p != NULL && id != NULL);
  mi_assert_internal(blocks < MI_REGION_MAP_BITS);

  const uintptr_t mask = mi_region_block_mask(blocks, 0);
  const size_t bitidx_max = MI_REGION_MAP_BITS - blocks;
  uintptr_t map = mi_atomic_read(&region->map);

  #ifdef MI_HAVE_BITSCAN
  size_t bitidx = mi_bsf(~map);    // quickly find the first zero bit if possible
  #else
  size_t bitidx = 0;               // otherwise start at 0
  #endif
  uintptr_t m = (mask << bitidx);     // invariant: m == mask shifted by bitidx

  // scan linearly for a free range of zero bits
  while(bitidx <= bitidx_max) {
    if ((map & m) == 0) {  // are the mask bits free at bitidx?
      mi_assert_internal((m >> bitidx) == mask); // no overflow?
      uintptr_t newmap = map | m;
      mi_assert_internal((newmap^map) >> bitidx == mask);
      if (!mi_atomic_compare_exchange(&region->map, newmap, map)) {
        // no success, another thread claimed concurrently.. keep going
        map = mi_atomic_read(&region->map);
        continue;
      }
      else {
        // success, we claimed the bits
        // now commit the block memory -- this can still fail
        return mi_region_commit_blocks(region, idx, bitidx, blocks, size, commit, p, id, tld);
      }
    }
    else {
      // on to the next bit range
      #ifdef MI_HAVE_BITSCAN
      size_t shift = (blocks == 1 ? 1 : mi_bsr(map & m) - bitidx + 1);
      mi_assert_internal(shift > 0 && shift <= blocks);
      #else
      size_t shift = 1;
      #endif
      bitidx += shift;
      m <<= shift;
    }
  }
  // no error, but also no bits found
  return true;
}
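
// Scan example (illustration only): with blocks == 2 and map == 0b1011, we start at
// bitidx = mi_bsf(~map) = 2 with m = 0b1100; since map & m == 0b1000 is non-zero we
// skip past its highest set bit (shift = 3 - 2 + 1 = 2) to bitidx = 4, where
// map & 0b110000 == 0 and bits 4..5 can be claimed with the compare-and-exchange.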

// Try to allocate `blocks` in a `region` at `idx` of a given `size`. Does a quick check before trying to claim.
// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written
// if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call.
// (not being able to claim is not considered an error so check for `p != NULL` afterwards).
static bool mi_region_try_alloc_blocks(size_t idx, size_t blocks, size_t size, bool commit, void** p, size_t* id, mi_os_tld_t* tld)
{
  // check if there are available blocks in the region..
  mi_assert_internal(idx < MI_REGION_MAX);
  mem_region_t* region = &regions[idx];
  uintptr_t m = mi_atomic_read(&region->map);
  if (m != MI_REGION_MAP_FULL) {  // some bits are zero
    return mi_region_alloc_blocks(region, idx, blocks, size, commit, p, id, tld);
  }
  else {
    return true;  // no error, but no success either
  }
}

/* ----------------------------------------------------------------------------
 Allocation
-----------------------------------------------------------------------------*/

// Allocate `size` memory aligned at `alignment`. Return non NULL on success, with a given memory `id`.
// (`id` is abstract, but `id = idx*MI_REGION_MAP_BITS + bitidx`)
void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool commit, size_t* id, mi_os_tld_t* tld)
{
  mi_assert_internal(id != NULL && tld != NULL);
  mi_assert_internal(size > 0);
  *id = SIZE_MAX;

  // use direct OS allocation for huge blocks or large alignment (with `id = SIZE_MAX`)
  if (size > MI_REGION_MAX_ALLOC_SIZE || alignment > MI_SEGMENT_ALIGN) {
    return _mi_os_alloc_aligned(mi_good_commit_size(size), alignment, true, tld);  // round up size
  }

  // always round size to OS page size multiple (so commit/decommit go over the entire range)
  // TODO: use large OS page size here?
  size = _mi_align_up(size, _mi_os_page_size());

  // calculate the number of needed blocks
  size_t blocks = mi_region_block_count(size);
  mi_assert_internal(blocks > 0 && blocks <= 8*MI_INTPTR_SIZE);

  // find a range of free blocks
  void* p = NULL;
  size_t count = mi_atomic_read(&regions_count);
  size_t idx = tld->region_idx; // start index is per-thread to reduce contention
  for (size_t visited = 0; visited < count; visited++, idx++) {
    if (idx >= count) idx = 0;  // wrap around
    if (!mi_region_try_alloc_blocks(idx, blocks, size, commit, &p, id, tld)) return NULL; // error
    if (p != NULL) break;
  }

  if (p == NULL) {
    // no free range in existing regions -- try to extend beyond the current count, but by at most 4 regions
    for (idx = count; idx < count + 4 && idx < MI_REGION_MAX; idx++) {
      if (!mi_region_try_alloc_blocks(idx, blocks, size, commit, &p, id, tld)) return NULL; // error
      if (p != NULL) break;
    }
  }

  if (p == NULL) {
    // we could not find a place to allocate, fall back to the os directly
    p = _mi_os_alloc_aligned(size, alignment, commit, tld);
  }
  else {
    tld->region_idx = idx;  // next start of search
  }

  mi_assert_internal(p == NULL || alignment == 0 || (uintptr_t)p % alignment == 0);  // guard against modulo-by-zero when `alignment==0` (e.g. via `_mi_mem_alloc`)
  return p;
}


// Allocate `size` memory. Return non NULL on success, with a given memory `id`.
void* _mi_mem_alloc(size_t size, bool commit, size_t* id, mi_os_tld_t* tld) {
  return _mi_mem_alloc_aligned(size,0,commit,id,tld);
}
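
/* Usage sketch (hypothetical caller, not part of this file): the segment layer is
   expected to pair these calls roughly as follows, where `tld` and `stats` come from
   the caller's thread-local data:

     size_t memid;
     void* p = _mi_mem_alloc_aligned(MI_SEGMENT_SIZE, MI_SEGMENT_SIZE, true, &memid, tld);
     if (p != NULL) {
       // ... use the memory ...
       _mi_mem_free(p, MI_SEGMENT_SIZE, memid, stats);
     }
*/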

/* ----------------------------------------------------------------------------
Free
-----------------------------------------------------------------------------*/

// Free previously allocated memory with a given id.
void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) {
  mi_assert_internal(size > 0 && stats != NULL);
  if (p==NULL) return;
  if (size==0) return;
  if (id == SIZE_MAX) {
    // was a direct OS allocation, pass through
    _mi_os_free(p, size, stats);
  }
  else {
    // allocated in a region
    mi_assert_internal(size <= MI_REGION_MAX_ALLOC_SIZE); if (size > MI_REGION_MAX_ALLOC_SIZE) return;
    // we can align the size up to page size (as we allocate that way too)
    // this ensures we fully commit/decommit/reset
    size = _mi_align_up(size, _mi_os_page_size());
    size_t idx = (id / MI_REGION_MAP_BITS);
    size_t bitidx = (id % MI_REGION_MAP_BITS);
    size_t blocks = mi_region_block_count(size);
    size_t mask = mi_region_block_mask(blocks, bitidx);
    mi_assert_internal(idx < MI_REGION_MAX); if (idx >= MI_REGION_MAX) return; // or `abort`?
    mem_region_t* region = &regions[idx];
    mi_assert_internal((mi_atomic_read(&region->map) & mask) == mask ); // claimed?
    void* start = mi_atomic_read_ptr(&region->start);
    mi_assert_internal(start != NULL);
    void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE);
    mi_assert_internal(blocks_start == p); // not a pointer in our area?
    mi_assert_internal(bitidx + blocks <= MI_REGION_MAP_BITS);
    if (blocks_start != p || bitidx + blocks > MI_REGION_MAP_BITS) return; // or `abort`?

    // decommit (or reset) the blocks to reduce the working set.
    // TODO: implement delayed decommit/reset as these calls are too expensive
    // if the memory is reused soon.
    // reset: 10x slowdown on malloc-large, decommit: 17x slowdown on malloc-large
    if (!mi_option_is_enabled(mi_option_large_os_pages)) {
      if (mi_option_is_enabled(mi_option_eager_region_commit)) {
        //_mi_os_reset(p, size, stats);
      }
      else {
        //_mi_os_decommit(p, size, stats);
      }
    }

    // TODO: should we free empty regions here? Currently this is only done in _mi_mem_collect.
    // Freeing would release virtual address space, which might be useful on 32-bit systems.

    // and unclaim
    uintptr_t map;
    uintptr_t newmap;
    do {
      map = mi_atomic_read(&region->map);
      newmap = map & ~mask;
    } while (!mi_atomic_compare_exchange(&region->map, newmap, map));
  }
}


/* ----------------------------------------------------------------------------
  Collection
-----------------------------------------------------------------------------*/
void _mi_mem_collect(mi_stats_t* stats) {
  // free every region that has no segments in use.
  for (size_t i = 0; i < regions_count; i++) {
    mem_region_t* region = &regions[i];
    if (mi_atomic_read(&region->map) == 0 && region->start != NULL) {
      // if no segments used, try to claim the whole region
      uintptr_t m;
      do {
        m = mi_atomic_read(&region->map);
      } while(m == 0 && !mi_atomic_compare_exchange(&region->map, ~((uintptr_t)0), 0 ));
      if (m == 0) {
        // on success, free the whole region
        if (region->start != NULL) _mi_os_free((void*)region->start, MI_REGION_SIZE, stats);
        // and release
        region->start = 0;
        mi_atomic_write(&region->map,0);
      }
    }
  }
}

/* ----------------------------------------------------------------------------
  Other
-----------------------------------------------------------------------------*/

bool _mi_mem_commit(void* p, size_t size, mi_stats_t* stats) {
  return _mi_os_commit(p, size, stats);
}

bool _mi_mem_decommit(void* p, size_t size, mi_stats_t* stats) {
  return _mi_os_decommit(p, size, stats);
}

bool _mi_mem_reset(void* p, size_t size, mi_stats_t* stats) {
  return _mi_os_reset(p, size, stats);
}

bool _mi_mem_unreset(void* p, size_t size, mi_stats_t* stats) {
  return _mi_os_unreset(p, size, stats);
}

bool _mi_mem_protect(void* p, size_t size) {
  return _mi_os_protect(p, size);
}

bool _mi_mem_unprotect(void* p, size_t size) {
  return _mi_os_unprotect(p, size);
}