author    | daan <daanl@outlook.com> | 2020-03-16 16:41:21 -0700
committer | daan <daanl@outlook.com> | 2020-03-16 16:41:21 -0700
commit    | 1f396e64a09987704d7f4a9b56169d15fa85e7bd (patch)
tree      | 4d7b3c5c16df578f57544622e5e5b6b2ab7037a1 /include/mimalloc-internal.h
parent    | d221a4b9049344758e31850cf0f2716b5e6ff7e3 (diff)
parent    | 980d343f391ceedb4f589f352c9bbb5155fd5f43 (diff)
merge from dev
Diffstat (limited to 'include/mimalloc-internal.h')
-rw-r--r-- | include/mimalloc-internal.h | 222
1 file changed, 162 insertions, 60 deletions
```diff
diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h
index e27271b..e264751 100644
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@@ -10,10 +10,6 @@ terms of the MIT license. A copy of the license can be found in the file
 
 #include "mimalloc-types.h"
 
-#if defined(MI_MALLOC_OVERRIDE) && (defined(__APPLE__) || defined(__OpenBSD__) || defined(__DragonFly__))
-#define MI_TLS_RECURSE_GUARD
-#endif
-
 #if (MI_DEBUG>0)
 #define mi_trace_message(...)  _mi_trace_message(__VA_ARGS__)
 #else
@@ -33,7 +29,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #else
 #define mi_decl_noinline
 #define mi_decl_thread   __thread  // hope for the best :-)
-#define mi_decl_cache_align 
+#define mi_decl_cache_align
 #endif
@@ -51,6 +47,7 @@ void _mi_random_init(mi_random_ctx_t* ctx);
 void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx);
 uintptr_t _mi_random_next(mi_random_ctx_t* ctx);
 uintptr_t _mi_heap_random_next(mi_heap_t* heap);
+uintptr_t _os_random_weak(uintptr_t extra_seed);
 static inline uintptr_t _mi_random_shuffle(uintptr_t x);
 
 // init.c
@@ -71,6 +68,7 @@ bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* stat
 bool _mi_os_decommit(void* p, size_t size, mi_stats_t* stats);
 bool _mi_os_reset(void* p, size_t size, mi_stats_t* stats);
 bool _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
+size_t _mi_os_good_alloc_size(size_t size);
 
 // arena.c
 void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
@@ -89,6 +87,8 @@ uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t*
 void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld);
 void _mi_abandoned_await_readers(void);
+
+
 
 // "page.c"
 void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc;
 
@@ -238,9 +238,13 @@ static inline size_t _mi_wsize_from_size(size_t size) {
   return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t);
 }
 
+// Does malloc satisfy the alignment constraints already?
+static inline bool mi_malloc_satisfies_alignment(size_t alignment, size_t size) {
+  return (alignment == sizeof(void*) || (alignment == MI_MAX_ALIGN_SIZE && size > (MI_MAX_ALIGN_SIZE/2)));
+}
 
 // Overflow detecting multiply
-static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) { 
+static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
 #if __has_builtin(__builtin_umul_overflow) || __GNUC__ >= 5
 #include <limits.h>   // UINT_MAX, ULONG_MAX
 #if (SIZE_MAX == UINT_MAX)
@@ -273,26 +277,76 @@ static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* tot
 }
 
-/* -----------------------------------------------------------
-  The thread local default heap
------------------------------------------------------------ */
+/* ----------------------------------------------------------------------------------------
+The thread local default heap: `_mi_get_default_heap` returns the thread local heap.
+On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a
+__thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures
+that the storage will always be available (allocated on the thread stacks).
+On some platforms though we cannot use that when overriding `malloc` since the underlying
+TLS implementation (or the loader) will call itself `malloc` on a first access and recurse.
+We try to circumvent this in an efficient way:
+- macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the
+           loader itself calls `malloc` even before the modules are initialized.
+- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS).
+- DragonFly: not yet working.
+------------------------------------------------------------------------------------------- */
 
 extern const mi_heap_t _mi_heap_empty;  // read-only empty heap, initial value of the thread local default heap
-extern mi_heap_t _mi_heap_main;         // statically allocated main backing heap
 extern bool _mi_process_is_initialized;
+mi_heap_t*  _mi_heap_main_get(void);    // statically allocated main backing heap
+
+#if defined(MI_MALLOC_OVERRIDE)
+#if defined(__MACH__) // OSX
+#define MI_TLS_SLOT               89  // seems unused?
+// other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89)
+// see <https://github.com/rweichler/substrate/blob/master/include/pthread_machdep.h>
+#elif defined(__OpenBSD__)
+// use end bytes of a name; goes wrong if anyone uses names > 23 characters (pthread specifies 16)
+// see <https://github.com/openbsd/src/blob/master/lib/libc/include/thread_private.h#L371>
+#define MI_TLS_PTHREAD_SLOT_OFS   (6*sizeof(int) + 4*sizeof(void*) + 24)
+#elif defined(__DragonFly__)
+#warning "mimalloc is not working correctly on DragonFly yet."
+#define MI_TLS_PTHREAD_SLOT_OFS   (4 + 1*sizeof(void*))  // offset `uniqueid` (also used by gdb?) <https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libthread_xu/thread/thr_private.h#L458>
+#endif
+#endif
+
+#if defined(MI_TLS_SLOT)
+static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept;   // forward declaration
+#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
+#include <pthread.h>
+static inline mi_heap_t** mi_tls_pthread_heap_slot(void) {
+  pthread_t self = pthread_self();
+  #if defined(__DragonFly__)
+  if (self==NULL) {
+    static mi_heap_t* pheap_main = _mi_heap_main_get();
+    return &pheap_main;
+  }
+  #endif
+  return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS);
+}
+#elif defined(MI_TLS_PTHREAD)
+#include <pthread.h>
+extern pthread_key_t _mi_heap_default_key;
+#else
 extern mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from
+#endif
 
 static inline mi_heap_t* mi_get_default_heap(void) {
-#ifdef MI_TLS_RECURSE_GUARD
-  // on some BSD platforms, like macOS, the dynamic loader calls `malloc`
-  // to initialize thread local data. To avoid recursion, we need to avoid
-  // accessing the thread local `_mi_default_heap` until our module is loaded
-  // and use the statically allocated main heap until that time.
-  // TODO: patch ourselves dynamically to avoid this check every time?
-  if (!_mi_process_is_initialized) return &_mi_heap_main;
-#endif
+#if defined(MI_TLS_SLOT)
+  mi_heap_t* heap = (mi_heap_t*)mi_tls_slot(MI_TLS_SLOT);
+  return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
+#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
+  mi_heap_t* heap = *mi_tls_pthread_heap_slot();
+  return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
+#elif defined(MI_TLS_PTHREAD)
+  mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key));
+  return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
+#else
+  #if defined(MI_TLS_RECURSE_GUARD)
+  if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get();
+  #endif
   return _mi_heap_default;
+#endif
 }
 
 static inline bool mi_heap_is_default(const mi_heap_t* heap) {
@@ -309,6 +363,8 @@ static inline bool mi_heap_is_initialized(mi_heap_t* heap) {
 }
 
 static inline uintptr_t _mi_ptr_cookie(const void* p) {
+  extern mi_heap_t _mi_heap_main;
+  mi_assert_internal(_mi_heap_main.cookie != 0);
   return ((uintptr_t)p ^ _mi_heap_main.cookie);
 }
 
@@ -317,8 +373,10 @@ static inline uintptr_t _mi_ptr_cookie(const void* p) {
 ----------------------------------------------------------- */
 
 static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t size) {
-  mi_assert_internal(size <= MI_SMALL_SIZE_MAX);
-  return heap->pages_free_direct[_mi_wsize_from_size(size)];
+  mi_assert_internal(size <= (MI_SMALL_SIZE_MAX + MI_PADDING_SIZE));
+  const size_t idx = _mi_wsize_from_size(size);
+  mi_assert_internal(idx < MI_PAGES_DIRECT);
+  return heap->pages_free_direct[idx];
 }
 
 // Get the page belonging to a certain size class
@@ -394,6 +452,13 @@ static inline size_t mi_page_block_size(const mi_page_t* page) {
   }
 }
 
+// Get the usable block size of a page without fixed padding.
+// This may still include internal padding due to alignment and rounding up size classes.
+static inline size_t mi_page_usable_block_size(const mi_page_t* page) {
+  return mi_page_block_size(page) - MI_PADDING_SIZE;
+}
+
+
 // Thread free access
 static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) {
   return (mi_block_t*)(mi_atomic_read_relaxed(&page->xthread_free) & ~3);
@@ -430,14 +495,14 @@ static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t*
   return mi_tf_make(block, mi_tf_delayed(tf));
 }
 
-// are all blocks in a page freed? 
+// are all blocks in a page freed?
 // note: needs up-to-date used count, (as the `xthread_free` list may not be empty). see `_mi_page_collect_free`.
 static inline bool mi_page_all_free(const mi_page_t* page) {
   mi_assert_internal(page != NULL);
   return (page->used == 0);
 }
 
-// are there any available blocks? 
+// are there any available blocks?
 static inline bool mi_page_has_any_available(const mi_page_t* page) {
   mi_assert_internal(page != NULL && page->reserved > 0);
   return (page->used < page->reserved || (mi_page_thread_free(page) != NULL));
@@ -485,11 +550,11 @@ static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) {
 
 /* -------------------------------------------------------------------
 Encoding/Decoding the free list next pointers
-This is to protect against buffer overflow exploits where the 
-free list is mutated. Many hardened allocators xor the next pointer `p` 
+This is to protect against buffer overflow exploits where the
+free list is mutated. Many hardened allocators xor the next pointer `p`
 with a secret key `k1`, as `p^k1`. This prevents overwriting with known
-values but might be still too weak: if the attacker can guess 
-the pointer `p` this can reveal `k1` (since `p^k1^p == k1`). 
+values but might be still too weak: if the attacker can guess
+the pointer `p` this can reveal `k1` (since `p^k1^p == k1`).
 Moreover, if multiple blocks can be read as well, the attacker can
 xor both as `(p1^k1) ^ (p2^k1) == p1^p2` which may reveal a lot
 about the pointers (and subsequently `k1`).
@@ -497,9 +562,9 @@ about the pointers (and subsequently `k1`).
 Instead mimalloc uses an extra key `k2` and encodes as `((p^k2)<<<k1)+k1`.
 Since these operations are not associative, the above approaches do not
 work so well any more even if the `p` can be guesstimated. For example,
-for the read case we can subtract two entries to discard the `+k1` term, 
+for the read case we can subtract two entries to discard the `+k1` term,
 but that leads to `((p1^k2)<<<k1) - ((p2^k2)<<<k1)` at best.
-We include the left-rotation since xor and addition are otherwise linear 
+We include the left-rotation since xor and addition are otherwise linear
 in the lowest bit.
 
 Finally, both keys are unique per page which reduces the re-use of keys by a large factor.
@@ -526,30 +591,37 @@ static inline uintptr_t mi_rotr(uintptr_t x, uintptr_t shift) {
   return ((x >> shift) | (x << (MI_INTPTR_BITS - shift)));
 }
 
-static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, uintptr_t key1, uintptr_t key2 ) {
+static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) {
+  void* p = (void*)(mi_rotr(x - keys[0], keys[0]) ^ keys[1]);
+  return (mi_unlikely(p==null) ? NULL : p);
+}
+
+static inline mi_encoded_t mi_ptr_encode(const void* null, const void* p, const uintptr_t* keys) {
+  uintptr_t x = (uintptr_t)(mi_unlikely(p==NULL) ? null : p);
+  return mi_rotl(x ^ keys[1], keys[0]) + keys[0];
+}
+
+static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, const uintptr_t* keys ) {
 #ifdef MI_ENCODE_FREELIST
-  mi_block_t* b = (mi_block_t*)(mi_rotr(block->next - key1, key1) ^ key2);
-  if (mi_unlikely((void*)b==null)) { b = NULL; }
-  return b;
+  return (mi_block_t*)mi_ptr_decode(null, block->next, keys);
 #else
-  UNUSED(key1); UNUSED(key2); UNUSED(null);
+  UNUSED(keys); UNUSED(null);
  return (mi_block_t*)block->next;
 #endif
 }
 
-static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, uintptr_t key1, uintptr_t key2) {
+static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, const uintptr_t* keys) {
 #ifdef MI_ENCODE_FREELIST
-  if (mi_unlikely(next==NULL)) { next = (mi_block_t*)null; }
-  block->next = mi_rotl((uintptr_t)next ^ key2, key1) + key1;
+  block->next = mi_ptr_encode(null, next, keys);
 #else
-  UNUSED(key1); UNUSED(key2); UNUSED(null);
+  UNUSED(keys); UNUSED(null);
   block->next = (mi_encoded_t)next;
 #endif
 }
 
 static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* block) {
 #ifdef MI_ENCODE_FREELIST
-  mi_block_t* next = mi_block_nextx(page,block,page->key[0],page->key[1]);
+  mi_block_t* next = mi_block_nextx(page,block,page->keys);
   // check for free list corruption: is `next` at least in the same page?
   // TODO: check if `next` is `page->block_size` aligned?
   if (mi_unlikely(next!=NULL && !mi_is_in_same_page(block, next))) {
@@ -559,16 +631,16 @@ static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t*
   return next;
 #else
   UNUSED(page);
-  return mi_block_nextx(page,block,0,0);
+  return mi_block_nextx(page,block,NULL);
 #endif
 }
 
 static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, const mi_block_t* next) {
 #ifdef MI_ENCODE_FREELIST
-  mi_block_set_nextx(page,block,next, page->key[0], page->key[1]);
+  mi_block_set_nextx(page,block,next, page->keys);
 #else
   UNUSED(page);
-  mi_block_set_nextx(page,block, next,0,0);
+  mi_block_set_nextx(page,block,next,NULL);
 #endif
 }
 
@@ -615,9 +687,8 @@ static inline size_t _mi_os_numa_node_count(void) {
 
 // -------------------------------------------------------------------
-// Getting the thread id should be performant
-// as it is called in the fast path of `_mi_free`,
-// so we specialize for various platforms.
+// Getting the thread id should be performant as it is called in the
+// fast path of `_mi_free` and we specialize for various platforms.
 // -------------------------------------------------------------------
 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
@@ -626,24 +697,55 @@ static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept {
   // Windows: works on Intel and ARM in both 32- and 64-bit
   return (uintptr_t)NtCurrentTeb();
 }
-#elif (defined(__GNUC__) || defined(__clang__)) && \
+
+#elif defined(__GNUC__) && \
       (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))
-// TLS register on x86 is in the FS or GS register
-// see: https://akkadia.org/drepper/tls.pdf
+
+// TLS register on x86 is in the FS or GS register, see: https://akkadia.org/drepper/tls.pdf
+static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept {
+  void* res;
+  const size_t ofs = (slot*sizeof(void*));
+#if defined(__i386__)
+  __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // 32-bit always uses GS
+#elif defined(__MACH__) && defined(__x86_64__)
+  __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 macOSX uses GS
+#elif defined(__x86_64__)
+  __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 Linux, BSD uses FS
+#elif defined(__arm__)
+  void** tcb; UNUSED(ofs);
+  asm volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
+  res = tcb[slot];
+#elif defined(__aarch64__)
+  void** tcb; UNUSED(ofs);
+  asm volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
+  res = tcb[slot];
+#endif
+  return res;
+}
+
+// setting is only used on macOSX for now
+static inline void mi_tls_slot_set(size_t slot, void* value) mi_attr_noexcept {
+  const size_t ofs = (slot*sizeof(void*));
+#if defined(__i386__)
+  __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // 32-bit always uses GS
+#elif defined(__MACH__) && defined(__x86_64__)
+  __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 macOSX uses GS
+#elif defined(__x86_64__)
+  __asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 Linux, BSD uses FS
+#elif defined(__arm__)
+  void** tcb; UNUSED(ofs);
+  asm volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
+  tcb[slot] = value;
+#elif defined(__aarch64__)
+  void** tcb; UNUSED(ofs);
  asm volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
+  tcb[slot] = value;
+#endif
+}
+
 static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept {
-  uintptr_t tid;
-  #if defined(__i386__)
-  __asm__("movl %%gs:0, %0" : "=r" (tid) : : );  // 32-bit always uses GS
-  #elif defined(__MACH__)
-  __asm__("movq %%gs:0, %0" : "=r" (tid) : : );  // x86_64 macOS uses GS
-  #elif defined(__x86_64__)
-  __asm__("movq %%fs:0, %0" : "=r" (tid) : : );  // x86_64 Linux, BSD uses FS
-  #elif defined(__arm__)
-  asm volatile ("mrc p15, 0, %0, c13, c0, 3" : "=r" (tid));
-  #elif defined(__aarch64__)
-  asm volatile ("mrs %0, tpidr_el0" : "=r" (tid));
-  #endif
-  return tid;
+  // in all our targets, slot 0 is the pointer to the thread control block
+  return (uintptr_t)mi_tls_slot(0);
 }
 #else
 // otherwise use standard C
```
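The new `mi_malloc_satisfies_alignment` helper (hunk at line 238) lets aligned allocation fall back to plain `malloc` when the requested alignment is already guaranteed. A minimal sketch of the same predicate, assuming a typical 64-bit target where `MI_MAX_ALIGN_SIZE` is 16 (the `MAX_ALIGN_SIZE` macro and the function name below are stand-ins, not mimalloc's):

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define MAX_ALIGN_SIZE 16  // stand-in for MI_MAX_ALIGN_SIZE (16 on most 64-bit targets)

// Same shape as mi_malloc_satisfies_alignment in the diff: malloc always returns
// pointer-aligned blocks, and blocks larger than MAX_ALIGN_SIZE/2 land in size
// classes that are already MAX_ALIGN_SIZE-aligned.
static bool malloc_satisfies_alignment(size_t alignment, size_t size) {
  return (alignment == sizeof(void*) ||
          (alignment == MAX_ALIGN_SIZE && size > (MAX_ALIGN_SIZE/2)));
}

int main(void) {
  printf("%d\n", malloc_satisfies_alignment(sizeof(void*), 24)); // 1: pointer alignment is free
  printf("%d\n", malloc_satisfies_alignment(16, 100));           // 1: large enough block
  printf("%d\n", malloc_satisfies_alignment(16, 8));             // 0: small blocks may pack tighter
  printf("%d\n", malloc_satisfies_alignment(64, 100));           // 0: needs the aligned path
  return 0;
}
```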
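`mi_mul_overflow` (also in the hunk at line 238) maps to `__builtin_umul*_overflow` on GCC 5+ and Clang; on other compilers mimalloc uses a division-avoiding fallback. A sketch of both strategies with generic names (`mul_overflow` and `MUL_NO_OVERFLOW` are illustrative; the fallback constant mirrors mimalloc's `MI_MUL_NO_OVERFLOW`):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

// Only when an operand reaches sqrt(SIZE_MAX+1) can the product overflow,
// so the (slow) division check runs only for large inputs.
#define MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t)))  // sqrt(SIZE_MAX+1)

static bool mul_overflow(size_t count, size_t size, size_t* total) {
#if __GNUC__ >= 5 || defined(__clang__)
  return __builtin_mul_overflow(count, size, total);  // one multiply + flag check
#else
  *total = count * size;
  return ((size >= MUL_NO_OVERFLOW || count >= MUL_NO_OVERFLOW)
          && size > 0 && (SIZE_MAX / size) < count);
#endif
}

int main(void) {
  size_t total;
  printf("%d\n", mul_overflow(1000, 1000, &total));     // 0: fits, total == 1000000
  printf("%d\n", mul_overflow(SIZE_MAX/2, 4, &total));  // 1: overflows size_t
  return 0;
}
```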
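The heart of the TLS rework is that every path of `mi_get_default_heap` must return something sensible before any thread-local state exists, without calling `malloc`: an uninitialized slot reads as `NULL` and yields the read-only empty heap. A self-contained sketch of the `MI_TLS_PTHREAD` branch; `heap_t`, `heap_key`, and friends are toy stand-ins, and comparing a `pthread_key_t` against `(pthread_key_t)(-1)` assumes an arithmetic key type, as mimalloc itself does:

```c
// build with: cc heap.c -pthread
#include <pthread.h>
#include <stdio.h>

typedef struct heap_s { int id; } heap_t;   // toy stand-in for mi_heap_t
static heap_t heap_empty = { 0 };           // placeholder, like _mi_heap_empty
static heap_t heap_main  = { 1 };           // statically allocated main heap

static pthread_key_t heap_key = (pthread_key_t)(-1);

static void heap_init(void) {               // e.g. run from a library constructor
  pthread_key_create(&heap_key, NULL);
  pthread_setspecific(heap_key, &heap_main);
}

// Mirrors the MI_TLS_PTHREAD branch: before the key exists, fall back to the
// main heap; a NULL slot means "not yet set" and yields the empty heap, so
// callers never dereference NULL and never recurse into malloc.
static heap_t* get_default_heap(void) {
  if (heap_key == (pthread_key_t)(-1)) return &heap_main;
  heap_t* heap = (heap_t*)pthread_getspecific(heap_key);
  return (heap == NULL ? &heap_empty : heap);
}

int main(void) {
  printf("before init: heap id %d\n", get_default_heap()->id);  // 1 (main fallback)
  heap_init();
  printf("after init : heap id %d\n", get_default_heap()->id);  // 1 (per-thread slot)
  return 0;
}
```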
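The free-list hardening described in the comment block, `((p^k2)<<<k1)+k1`, is now factored into the `mi_ptr_encode`/`mi_ptr_decode` pair. A standalone round-trip demonstration with arbitrary example keys; the zero-shift guard in `rotl`/`rotr` is an extra safety net over the diff's `mi_rotl`/`mi_rotr`, since shifting by the full word width is undefined behavior in C:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define INTPTR_BITS ((uintptr_t)(sizeof(uintptr_t)*8))

// Rotations as in mi_rotl/mi_rotr, plus a guard for a zero effective shift.
static uintptr_t rotl(uintptr_t x, uintptr_t shift) {
  shift %= INTPTR_BITS;
  return (shift == 0 ? x : ((x << shift) | (x >> (INTPTR_BITS - shift))));
}
static uintptr_t rotr(uintptr_t x, uintptr_t shift) {
  shift %= INTPTR_BITS;
  return (shift == 0 ? x : ((x >> shift) | (x << (INTPTR_BITS - shift))));
}

// encode: ((p ^ k2) <<< k1) + k1, with keys = {k1, k2} as in mi_ptr_encode
static uintptr_t ptr_encode(uintptr_t p, const uintptr_t keys[2]) {
  return rotl(p ^ keys[1], keys[0]) + keys[0];
}
// decode is the exact inverse: subtract k1, rotate right by k1, xor with k2
static uintptr_t ptr_decode(uintptr_t x, const uintptr_t keys[2]) {
  return rotr(x - keys[0], keys[0]) ^ keys[1];
}

int main(void) {
  const uintptr_t keys[2] = { 0x1234567u, 0x89abcdefu };  // per-page random in mimalloc
  uintptr_t p = (uintptr_t)&keys;                         // any pointer value
  uintptr_t enc = ptr_encode(p, keys);
  assert(ptr_decode(enc, keys) == p);                     // round-trips exactly
  printf("p=%p  encoded=0x%zx\n", (void*)p, (size_t)enc);
  return 0;
}
```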
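Finally, `_mi_thread_id` is reduced to `mi_tls_slot(0)` because on all the targeted ABIs slot 0 of the thread control block holds the TCB's own address. A Linux/x86_64-only sketch, assuming glibc, where that address is also the value returned by `pthread_self` (an observation about glibc internals, not a portable guarantee):

```c
// Linux x86_64 with glibc only; build with: cc tid.c -pthread
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

// Read TLS slot `slot` relative to the FS segment, as mi_tls_slot does.
static inline void* tls_slot(size_t slot) {
  void* res;
  const size_t ofs = slot*sizeof(void*);
  __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)));
  return res;
}

int main(void) {
  // Slot 0 holds the TCB self-pointer, which glibc also uses as pthread_t.
  printf("tls_slot(0)  = %p\n", tls_slot(0));
  printf("pthread_self = %p\n", (void*)pthread_self());  // should print the same address
  return 0;
}
```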