/*
 * FastAlloc.h
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef FLOW_FASTALLOC_H
#define FLOW_FASTALLOC_H
#pragma once

#include "flow/Error.h"
#include "flow/Platform.h"
#include "flow/config.h"

// ALLOC_INSTRUMENTATION_STDOUT enables non-sampled logging of all allocations and deallocations to stdout to be
// processed by tools/alloc_instrumentation.py
//#define ALLOC_INSTRUMENTATION_STDOUT ENABLED(NOT_IN_CLEAN)

//#define ALLOC_INSTRUMENTATION ENABLED(NOT_IN_CLEAN)

// The form "(1==1)" in this context is used to satisfy both clang and vc++ with a single syntax. Clang rejects "1" and
// vc++ rejects "true".
#define FASTALLOC_THREAD_SAFE (FLOW_THREAD_SAFE || (1 == 1))

#if VALGRIND
#include <drd.h>
#include <memcheck.h>
bool valgrindPrecise();
#endif

#include "flow/Hash3.h"

#include <assert.h>
#include <atomic>
#include <vector>
#include <cstdlib>
#include <cstdio>
#include <unordered_map>

#if defined(ALLOC_INSTRUMENTATION) && defined(__linux__)
#include <execinfo.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#endif

#ifdef ALLOC_INSTRUMENTATION
#include <map>
#include <algorithm>
#include "flow/ThreadPrimitives.h"
struct AllocInstrInfo {
    int64_t allocCount;
    int64_t deallocCount;
    int64_t maxAllocated;
    inline void alloc(int64_t count = 1) {
        allocCount += count;
        maxAllocated = std::max(allocCount - deallocCount, maxAllocated);
    }
    inline void dealloc(int64_t count = 1) { deallocCount += count; }
};
extern std::map<const char*, AllocInstrInfo> allocInstr;
#define INSTRUMENT_ALLOCATE(name) (allocInstr[(name)].alloc())
#define INSTRUMENT_RELEASE(name) (allocInstr[(name)].dealloc())

// extern std::map<uint32_t, uint64_t> stackAllocations;

// maps from an address to the hash of the backtrace and the size of the allocation
extern std::unordered_map<int64_t, std::pair<uint32_t, size_t>> memSample;

struct BackTraceAccount {
    double count;
    size_t sampleCount;
    size_t totalSize;
    std::vector<void*>* backTrace;
};
// maps from a hash of a backtrace to a backtrace and the total size of data currently allocated from this stack
extern std::unordered_map<uint32_t, BackTraceAccount> backTraceLookup;

extern ThreadSpinLock memLock;
extern thread_local bool memSample_entered;
extern const size_t SAMPLE_BYTES;

#else
#define INSTRUMENT_ALLOCATE(name)
#define INSTRUMENT_RELEASE(name)
#endif

#if defined(ALLOC_INSTRUMENTATION) || defined(ALLOC_INSTRUMENTATION_STDOUT)
void recordAllocation(void* ptr, size_t size);
void recordDeallocation(void* ptr);
#endif

inline constexpr auto kFastAllocMagazineBytes = 128 << 10;

template <int Size>
class FastAllocator {
public:
    [[nodiscard]] static void* allocate();
    static void release(void* ptr);
    static void check(void* ptr, bool alloc);

    static long long getTotalMemory();
    static long long getApproximateMemoryUnused();
    static long long getActiveThreads();

#ifdef ALLOC_INSTRUMENTATION
    static volatile int32_t pageCount;
#endif

    FastAllocator() = delete;
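// Usage sketch (the type name below is hypothetical): a class opts in to fast
// allocation by deriving from FastAllocated of itself, which routes its
// operator new/delete through the matching FastAllocator size class.
//
//     struct ExampleNode : FastAllocated<ExampleNode> {
//         int64_t payload[4];
//     };
//
//     auto* n = new ExampleNode; // sizeof(ExampleNode) <= 64, so FastAllocator<64>::allocate()
//     delete n;                  // returned via FastAllocator<64>::release()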
private:
#ifdef VALGRIND
    static unsigned long vLock;
#endif

    static const int magazine_size = kFastAllocMagazineBytes / Size;
    static const int PSize = Size / sizeof(void*);
    struct GlobalData;
    struct ThreadData {
        void* freelist;
        int count; // there are count items on freelist
        void* alternate; // alternate is either a full magazine, or an empty one
        ThreadData();
        ~ThreadData();
    };
    struct ThreadDataInit {
        ThreadDataInit() { threadData(); }
    };
    // Used to try to initialize threadData as early as possible. It's still
    // possible that a static thread local variable (that owns fast-allocated
    // memory) could be constructed before threadData, in which case threadData
    // would be destroyed by the time that variable's destructor attempts to
    // free. If that happens, the free is undefined behavior, which is why we
    // want to initialize threadData as early as possible.
    static thread_local ThreadDataInit threadDataInit;

    // Used to access threadData. Returning a reference to a function-level
    // static guarantees that threadData will be constructed before it's
    // accessed here. Furthermore, if accessing threadData from a static thread
    // local variable's constructor, this guarantees that threadData will
    // outlive this object, since destruction order is the reverse of
    // construction order.
    static ThreadData& threadData() noexcept;

    static GlobalData* globalData() noexcept {
#ifdef VALGRIND
        ANNOTATE_RWLOCK_ACQUIRED(vLock, 1);
#endif
        static GlobalData* data = new GlobalData(); // This is thread-safe as of C++11 (VS 2015, gcc 4.8, clang 3.3)
#ifdef VALGRIND
        ANNOTATE_RWLOCK_RELEASED(vLock, 1);
#endif
        return data;
    }

    static void* freelist;

    static void getMagazine();
    static void releaseMagazine(void*);
};

extern std::atomic<int64_t> g_hugeArenaMemory;
void hugeArenaSample(int size);
void releaseAllThreadMagazines();
int64_t getTotalUnusedAllocatedMemory();

inline constexpr int nextFastAllocatedSize(int x) {
    assert(x > 0 && x <= 16384);
    if (x <= 16)
        return 16;
    else if (x <= 32)
        return 32;
    else if (x <= 64)
        return 64;
    else if (x <= 96)
        return 96;
    else if (x <= 128)
        return 128;
    else if (x <= 256)
        return 256;
    else if (x <= 512)
        return 512;
    else if (x <= 1024)
        return 1024;
    else if (x <= 2048)
        return 2048;
    else if (x <= 4096)
        return 4096;
    else if (x <= 8192)
        return 8192;
    else
        return 16384;
}
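// For illustration: a request rounds up to the next supported size class, so
// e.g. a 100-byte request is served from the 128-byte pool.
static_assert(nextFastAllocatedSize(16) == 16);
static_assert(nextFastAllocatedSize(100) == 128);
static_assert(nextFastAllocatedSize(257) == 512);
static_assert(nextFastAllocatedSize(16384) == 16384);

template <class Object>
class FastAllocated {
public:
    [[nodiscard]] static void* operator new(size_t s) {
        if (s != sizeof(Object))
            abort();
        INSTRUMENT_ALLOCATE(typeid(Object).name());

        if constexpr (sizeof(Object) <= 256) {
            void* p = FastAllocator<sizeof(Object) <= 64 ? 64 : nextFastAllocatedSize(sizeof(Object))>::allocate();
            return p;
        } else {
            void* p = new uint8_t[nextFastAllocatedSize(sizeof(Object))];
            return p;
        }
    }

    static void operator delete(void* s) {
        INSTRUMENT_RELEASE(typeid(Object).name());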
        if constexpr (sizeof(Object) <= 256) {
            FastAllocator<sizeof(Object) <= 64 ? 64 : nextFastAllocatedSize(sizeof(Object))>::release(s);
        } else {
            delete[] reinterpret_cast<uint8_t*>(s);
        }
    }
    // Redefine placement new so you can still use it
    static void* operator new(size_t, void* p) { return p; }
    static void operator delete(void*, void*) {}
};

[[nodiscard]] inline void* allocateFast(int size) {
    if (size <= 16)
        return FastAllocator<16>::allocate();
    if (size <= 32)
        return FastAllocator<32>::allocate();
    if (size <= 64)
        return FastAllocator<64>::allocate();
    if (size <= 96)
        return FastAllocator<96>::allocate();
    if (size <= 128)
        return FastAllocator<128>::allocate();
    if (size <= 256)
        return FastAllocator<256>::allocate();
    return new uint8_t[size];
}

inline void freeFast(int size, void* ptr) {
    if (size <= 16)
        return FastAllocator<16>::release(ptr);
    if (size <= 32)
        return FastAllocator<32>::release(ptr);
    if (size <= 64)
        return FastAllocator<64>::release(ptr);
    if (size <= 96)
        return FastAllocator<96>::release(ptr);
    if (size <= 128)
        return FastAllocator<128>::release(ptr);
    if (size <= 256)
        return FastAllocator<256>::release(ptr);
    delete[] reinterpret_cast<uint8_t*>(ptr);
}

// Allocate a block of memory aligned to 4096 bytes. Size must be a multiple of
// 4096. Guaranteed not to return null. Use freeFast4kAligned to free.
[[nodiscard]] inline void* allocateFast4kAligned(int size) {
#if !defined(USE_JEMALLOC)
    // Use FastAllocator for sizes it supports to avoid internal fragmentation in some implementations of aligned_alloc
    if (size <= 4096)
        return FastAllocator<4096>::allocate();
    if (size <= 8192)
        return FastAllocator<8192>::allocate();
    if (size <= 16384)
        return FastAllocator<16384>::allocate();
#endif
    auto* result = aligned_alloc(4096, size);
    if (result == nullptr) {
        platform::outOfMemory();
    }
    return result;
}

// Free a pointer returned from allocateFast4kAligned(size)
inline void freeFast4kAligned(int size, void* ptr) {
#if !defined(USE_JEMALLOC)
    // Sizes supported by FastAllocator must be released via FastAllocator
    if (size <= 4096)
        return FastAllocator<4096>::release(ptr);
    if (size <= 8192)
        return FastAllocator<8192>::release(ptr);
    if (size <= 16384)
        return FastAllocator<16384>::release(ptr);
#endif
    aligned_free(ptr);
}

#endif
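// Usage sketch for the helpers above (illustrative): the size passed to
// freeFast/freeFast4kAligned must match the size passed to the corresponding
// allocate call, because it selects the size class that owns the memory.
//
//     void* buf = allocateFast(200); // served by FastAllocator<256>
//     freeFast(200, buf);            // size 200 maps back to FastAllocator<256>
//
//     void* pages = allocateFast4kAligned(8192); // size must be a multiple of 4096
//     freeFast4kAligned(8192, pages);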