foundationdb/flow/FastAlloc.cpp
Jingyu Zhou 396b10caca Add memory profiling for FastAlloc when gperftools is used
FastAlloc is the major consumer of memory in FDB, yet we cannot profile its
usage. This commit replaces FastAlloc memory allocation with malloc so that we
may track its memory usage when gperftools is used.
2019-10-07 19:27:06 -07:00
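Note: with USE_GPERFTOOLS defined, FastAllocator<Size>::allocate() and release() below short-circuit to plain malloc()/free() so that gperftools' heap profiler can attribute this memory. (The build switch that defines USE_GPERFTOOLS is assumed here; see the project's build configuration.)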


/*
* FastAlloc.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "flow/FastAlloc.h"
#include "flow/ThreadPrimitives.h"
#include "flow/Trace.h"
#include "flow/Error.h"
#include "flow/Knobs.h"
#include "flow/flow.h"
#include <cstdint>
#include <unordered_map>
#ifdef WIN32
#include <windows.h>
#undef min
#undef max
#endif
#ifdef __linux__
#include <sys/mman.h>
#include <linux/mman.h>
#endif
#define FAST_ALLOCATOR_DEBUG 0
#ifdef _MSC_VER
// warning 4073 warns about "initializers put in library initialization area", which is our intent
#pragma warning (disable: 4073)
#pragma init_seg(lib)
#define INIT_SEG
#elif defined(__GNUG__)
#ifdef __linux__
#define INIT_SEG __attribute__ ((init_priority (1000)))
#elif defined(__APPLE__)
#pragma message "init_priority is not supported on this platform; will this be a problem?"
#define INIT_SEG
#else
#error Where am I?
#endif
#else
#error Port me? (init_seg(lib))
#endif
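// INIT_SEG places the thread-local allocator state below in the library
// initialization segment (or gives it a high init_priority), so it is
// constructed before ordinary static objects that may allocate through
// FastAllocator during their own initialization.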
template<int Size>
INIT_SEG thread_local typename FastAllocator<Size>::ThreadData FastAllocator<Size>::threadData;
template<int Size>
thread_local bool FastAllocator<Size>::threadInitialized = false;
#ifdef VALGRIND
template<int Size>
unsigned long FastAllocator<Size>::vLock = 1;
#endif
template<int Size>
void* FastAllocator<Size>::freelist = nullptr;
typedef void (*ThreadInitFunction)();
ThreadInitFunction threadInitFunction = 0; // See ThreadCleanup.cpp in the C binding
void setFastAllocatorThreadInitFunction( ThreadInitFunction f ) {
ASSERT( !threadInitFunction );
threadInitFunction = f;
}
std::atomic<int64_t> g_hugeArenaMemory(0);
double hugeArenaLastLogged = 0;
std::map<std::string, std::pair<int,int>> hugeArenaTraces;
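// Aggregates huge arena allocations by backtrace (only when called on the
// network thread), and every HUGE_ARENA_LOGGING_INTERVAL seconds flushes the
// per-backtrace counts and sizes as HugeArenaSample trace events.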
void hugeArenaSample(int size) {
if(TraceEvent::isNetworkThread()) {
auto& info = hugeArenaTraces[platform::get_backtrace()];
info.first++;
info.second+=size;
if(now() - hugeArenaLastLogged > FLOW_KNOBS->HUGE_ARENA_LOGGING_INTERVAL) {
for(auto& it : hugeArenaTraces) {
TraceEvent("HugeArenaSample").detail("Count", it.second.first).detail("Size", it.second.second).detail("Backtrace", it.first);
}
hugeArenaLastLogged = now();
hugeArenaTraces.clear();
}
}
}
#ifdef ALLOC_INSTRUMENTATION
INIT_SEG std::map<const char*, AllocInstrInfo> allocInstr;
INIT_SEG std::unordered_map<int64_t, std::pair<uint32_t, size_t>> memSample;
INIT_SEG std::unordered_map<uint32_t, BackTraceAccount> backTraceLookup;
INIT_SEG ThreadSpinLock memLock;
const size_t SAMPLE_BYTES = 1e7;
template<int Size>
volatile int32_t FastAllocator<Size>::pageCount;
thread_local bool memSample_entered = false;
#endif
#ifdef ALLOC_INSTRUMENTATION_STDOUT
thread_local bool inRecordAllocation = false;
#endif
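// Records a sampled subset of allocations, keyed by a hash of the backtrace.
// An allocation of `size` bytes is sampled with probability size/SAMPLE_BYTES;
// a sample contributes countDelta = max(1, SAMPLE_BYTES/size) allocations and
// sizeDelta = max(SAMPLE_BYTES, size) bytes, so the expected recorded totals
// are unbiased. For example, a 1 KiB allocation is sampled with probability
// ~1e-4 (1024/1e7) and, when sampled, is booked as ~1e4 allocations totaling
// 1e7 bytes.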
void recordAllocation( void *ptr, size_t size ) {
#ifdef ALLOC_INSTRUMENTATION_STDOUT
if( inRecordAllocation )
return;
inRecordAllocation = true;
std::string trace = platform::get_backtrace();
printf("Alloc\t%p\t%d\t%s\n", ptr, size, trace.c_str());
inRecordAllocation = false;
#endif
#ifdef ALLOC_INSTRUMENTATION
if( memSample_entered )
return;
memSample_entered = true;
if(((double)rand()) / RAND_MAX < ((double)size) / SAMPLE_BYTES) {
void *buffer[100];
#if defined(__linux__)
int nptrs = backtrace( buffer, 100 );
#elif defined(_WIN32)
// We could be using fourth parameter to get a hash, but we'll do this
// in a unified way between platforms
int nptrs = CaptureStackBackTrace( 1, 100, buffer, NULL );
#else
#error Instrumentation not supported on this platform
#endif
uint32_t a = 0, b = 0;
if( nptrs > 0 ) {
hashlittle2( buffer, nptrs * sizeof(void *), &a, &b );
}
double countDelta = std::max(1.0, ((double)SAMPLE_BYTES) / size);
size_t sizeDelta = std::max(SAMPLE_BYTES, size);
ThreadSpinLockHolder holder( memLock );
auto it = backTraceLookup.find( a );
if( it == backTraceLookup.end() ) {
auto& bt = backTraceLookup[ a ];
bt.backTrace = new std::vector<void*>();
for (int j = 0; j < nptrs; j++) {
bt.backTrace->push_back( buffer[j] );
}
bt.totalSize = sizeDelta;
bt.count = countDelta;
bt.sampleCount = 1;
} else {
it->second.totalSize += sizeDelta;
it->second.count += countDelta;
it->second.sampleCount++;
}
memSample[(int64_t)ptr] = std::make_pair(a, size);
}
memSample_entered = false;
#endif
}
void recordDeallocation( void *ptr ) {
#ifdef ALLOC_INSTRUMENTATION_STDOUT
if( inRecordAllocation )
return;
inRecordAllocation = true; // guard against recursive logging, mirroring recordAllocation()
printf("Dealloc\t%p\n", ptr);
inRecordAllocation = false;
#endif
#ifdef ALLOC_INSTRUMENTATION
if( memSample_entered ) // could this lead to deallocations not being recorded?
return;
memSample_entered = true;
{
ThreadSpinLockHolder holder( memLock );
auto it = memSample.find( (int64_t)ptr );
if( it == memSample.end() ) {
memSample_entered = false;
return;
}
auto bti = backTraceLookup.find( it->second.first );
ASSERT( bti != backTraceLookup.end() );
size_t sizeDelta = std::max(SAMPLE_BYTES, it->second.second);
double countDelta = std::max(1.0, ((double)SAMPLE_BYTES) / it->second.second);
bti->second.totalSize -= sizeDelta;
bti->second.count -= countDelta;
bti->second.sampleCount--;
memSample.erase( it );
}
memSample_entered = false;
#endif
}
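// Per-size-class state shared by all threads. Threads exchange memory with
// this global pool in whole magazines of magazine_size blocks; partial
// magazines exist only because exiting threads donate whatever remains in
// their current magazine (see releaseThreadMagazines()).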
template <int Size>
struct FastAllocator<Size>::GlobalData {
CRITICAL_SECTION mutex;
std::vector<void*> magazines; // These magazines are always exactly magazine_size ("full")
std::vector<std::pair<int, void*>> partial_magazines; // Magazines that are not "full" and their counts. Only created by releaseThreadMagazines().
long long totalMemory;
long long partialMagazineUnallocatedMemory;
long long activeThreads;
GlobalData() : totalMemory(0), partialMagazineUnallocatedMemory(0), activeThreads(0) {
InitializeCriticalSection(&mutex);
}
};
template <int Size>
long long FastAllocator<Size>::getTotalMemory() {
return globalData()->totalMemory;
}
// This does not include memory held by various threads that's available for allocation
template <int Size>
long long FastAllocator<Size>::getApproximateMemoryUnused() {
return globalData()->magazines.size() * magazine_size * Size + globalData()->partialMagazineUnallocatedMemory;
}
template <int Size>
long long FastAllocator<Size>::getActiveThreads() {
return globalData()->activeThreads;
}
#if FAST_ALLOCATOR_DEBUG
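// In debug builds each size class is mapped at deterministic virtual addresses
// of the form ((getSizeCode(Size)<<11) + alt) * magazine_size * Size, so that
// check() can validate from a raw pointer value whether it falls in this size
// class's address range and is properly aligned.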
static int64_t getSizeCode(int i) {
switch (i) {
case 16: return 1;
case 32: return 2;
case 64: return 3;
case 96: return 4;
case 128: return 5;
case 256: return 6;
case 512: return 7;
case 1024: return 8;
case 2048: return 9;
case 4096: return 10;
case 8192: return 11;
default: return 12;
}
}
#endif
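// Fast path: pop one block off the thread-local freelist. When the freelist is
// empty, swap in the thread's cached alternate magazine if present; otherwise
// refill from the global pool (or the OS) via getMagazine().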
template<int Size>
void *FastAllocator<Size>::allocate() {
if(!threadInitialized) {
initThread();
}
#ifdef USE_GPERFTOOLS
return malloc(Size);
#endif
#if FASTALLOC_THREAD_SAFE
ThreadData& thr = threadData;
if (!thr.freelist) {
ASSERT(thr.count == 0);
if (thr.alternate) {
thr.freelist = thr.alternate;
thr.alternate = nullptr;
thr.count = magazine_size;
} else {
getMagazine();
}
}
--thr.count;
void* p = thr.freelist;
#if VALGRIND
VALGRIND_MAKE_MEM_DEFINED(p, sizeof(void*));
#endif
thr.freelist = *(void**)p;
ASSERT(!thr.freelist == (thr.count == 0)); // freelist is empty if and only if count is 0
//check( p, true );
#else
void* p = freelist;
if (!p) {
getMagazine();
p = freelist; // re-read after refill; this legacy non-thread-safe path assumes getMagazine() refills the static freelist
}
#if VALGRIND
VALGRIND_MAKE_MEM_DEFINED(p, sizeof(void*));
#endif
freelist = *(void**)p;
#endif
#if VALGRIND
VALGRIND_MALLOCLIKE_BLOCK( p, Size, 0, 0 );
#endif
#if defined(ALLOC_INSTRUMENTATION) || defined(ALLOC_INSTRUMENTATION_STDOUT)
recordAllocation(p, Size);
#endif
return p;
}
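// Push the block back onto the thread-local freelist. Each thread caches at
// most two magazines (the current freelist plus one alternate); when both are
// full, one full magazine is returned to the global pool. Holding two
// magazines keeps a thread that frees and allocates near a magazine boundary
// from bouncing a single magazine through the global lock on every call.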
template<int Size>
void FastAllocator<Size>::release(void *ptr) {
if(!threadInitialized) {
initThread();
}
#ifdef USE_GPERFTOOLS
return free(ptr);
#endif
#if FASTALLOC_THREAD_SAFE
ThreadData& thr = threadData;
if (thr.count == magazine_size) {
if (thr.alternate) // Two full magazines, return one
releaseMagazine( thr.alternate );
thr.alternate = thr.freelist;
thr.freelist = nullptr;
thr.count = 0;
}
ASSERT(!thr.freelist == (thr.count == 0)); // freelist is empty if and only if count is 0
++thr.count;
*(void**)ptr = thr.freelist;
//check(ptr, false);
thr.freelist = ptr;
#else
*(void**)ptr = freelist;
freelist = ptr;
#endif
#if VALGRIND
VALGRIND_FREELIKE_BLOCK( ptr, 0 );
#endif
#if defined(ALLOC_INSTRUMENTATION) || defined(ALLOC_INSTRUMENTATION_STDOUT)
recordDeallocation( ptr );
#endif
}
template <int Size>
void FastAllocator<Size>::check(void* ptr, bool alloc) {
#if FAST_ALLOCATOR_DEBUG
//if (ptr == (void*)0x400200180)
// printf("%c%p\n", alloc?'+':'-', ptr);
// Check for pointers that aren't part of this FastAllocator
if (ptr < (void*)(((getSizeCode(Size)<<11) + 0) * magazine_size*Size) ||
ptr > (void*)(((getSizeCode(Size)<<11) + 4000) * magazine_size*Size) ||
(int64_t(ptr)&(Size-1)))
{
printf("Bad ptr: %p\n", ptr);
abort();
}
// Redundant freelist pointers to detect outright smashing of the freelist
if (alloc) {
if ( *((void**)ptr+1) != *(void**)ptr ) {
printf("Freelist corruption? %p %p\n", *(void**)ptr, *((void**)ptr+1));
abort();
}
*((void**)ptr+1) = (void*)0;
} else {
*((void**)ptr+1) = *(void**)ptr;
}
// Track allocated/free status in a completely separate data structure to detect double frees
int i = (int)((int64_t)ptr - ((getSizeCode(Size)<<11) + 0) * magazine_size*Size) / Size;
static std::vector<bool> isFreed;
if (!alloc) {
if (i+1 > isFreed.size())
isFreed.resize(i+1, false);
if (isFreed[i]) {
printf("Double free: %p\n", ptr);
abort();
}
isFreed[i] = true;
} else {
if (i+1 > isFreed.size()) {
printf("Allocate beyond end: %p\n", ptr);
abort();
}
if (!isFreed[i]) {
printf("Allocate non-freed: %p\n", ptr);
abort();
}
isFreed[i] = false;
}
#endif
}
template <int Size>
void FastAllocator<Size>::initThread() {
threadInitialized = true;
if (threadInitFunction) {
threadInitFunction();
}
EnterCriticalSection(&globalData()->mutex);
++globalData()->activeThreads;
LeaveCriticalSection(&globalData()->mutex);
threadData.freelist = nullptr;
threadData.alternate = nullptr;
threadData.count = 0;
}
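// Refill the calling thread's freelist: prefer a full magazine from the global
// pool, then a donated partial magazine, and otherwise carve a new magazine of
// magazine_size * Size bytes out of the system allocator. The freelist is
// intrusive: with PSize = Size / sizeof(void*), the first word of cell i
// (block[i*PSize]) points to cell i+1, and the last cell points to null.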
template <int Size>
void FastAllocator<Size>::getMagazine() {
ASSERT(threadInitialized);
ASSERT(!threadData.freelist && !threadData.alternate && threadData.count == 0);
EnterCriticalSection(&globalData()->mutex);
if (globalData()->magazines.size()) {
void* m = globalData()->magazines.back();
globalData()->magazines.pop_back();
LeaveCriticalSection(&globalData()->mutex);
threadData.freelist = m;
threadData.count = magazine_size;
return;
} else if (globalData()->partial_magazines.size()) {
std::pair<int, void*> p = globalData()->partial_magazines.back();
globalData()->partial_magazines.pop_back();
globalData()->partialMagazineUnallocatedMemory -= p.first * Size;
LeaveCriticalSection(&globalData()->mutex);
threadData.freelist = p.second;
threadData.count = p.first;
return;
}
globalData()->totalMemory += magazine_size*Size;
LeaveCriticalSection(&globalData()->mutex);
// Allocate a new page of data from the system allocator
#ifdef ALLOC_INSTRUMENTATION
interlockedIncrement(&pageCount);
#endif
void** block = nullptr;
#if FAST_ALLOCATOR_DEBUG
#ifdef WIN32
static int alt = 0; alt++;
block = (void**)VirtualAllocEx( GetCurrentProcess(),
(void*)( ((getSizeCode(Size)<<11) + alt) * magazine_size*Size), magazine_size*Size, MEM_COMMIT|MEM_RESERVE, PAGE_READWRITE );
#else
static int alt = 0; alt++;
void* desiredBlock = (void*)( ((getSizeCode(Size)<<11) + alt) * magazine_size*Size);
block = (void**)mmap( desiredBlock, magazine_size*Size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0 );
ASSERT( block == desiredBlock );
#endif
#else
// FIXME: We should be able to allocate larger magazine sizes here if we
// detect that the underlying system supports hugepages. Using hugepages
// with smaller-than-2MiB magazine sizes strands memory. See issue #909.
if(FLOW_KNOBS && g_trace_depth == 0 && nondeterministicRandom()->random01() < (magazine_size * Size)/FLOW_KNOBS->FAST_ALLOC_LOGGING_BYTES) {
TraceEvent("GetMagazineSample").detail("Size", Size).backtrace();
}
block = (void **)::allocate(magazine_size * Size, false);
#endif
//void** block = new void*[ magazine_size * PSize ];
for(int i=0; i<magazine_size-1; i++) {
block[i*PSize+1] = block[i*PSize] = &block[(i+1)*PSize];
check( &block[i*PSize], false );
}
block[(magazine_size-1)*PSize+1] = block[(magazine_size-1)*PSize] = nullptr;
check( &block[(magazine_size-1)*PSize], false );
threadData.freelist = block;
threadData.count = magazine_size;
}
template <int Size>
void FastAllocator<Size>::releaseMagazine(void* mag) {
ASSERT(threadInitialized);
EnterCriticalSection(&globalData()->mutex);
globalData()->magazines.push_back(mag);
LeaveCriticalSection(&globalData()->mutex);
}
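// Called on thread exit (via releaseAllThreadMagazines() below): a partially
// used magazine is donated to the global pool together with its remaining
// block count, while a full alternate magazine is returned whole.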
template <int Size>
void FastAllocator<Size>::releaseThreadMagazines() {
if(threadInitialized) {
threadInitialized = false;
ThreadData& thr = threadData;
EnterCriticalSection(&globalData()->mutex);
if (thr.freelist || thr.alternate) {
if (thr.freelist) {
ASSERT(thr.count > 0 && thr.count <= magazine_size);
globalData()->partial_magazines.push_back( std::make_pair(thr.count, thr.freelist) );
globalData()->partialMagazineUnallocatedMemory += thr.count * Size;
}
if (thr.alternate) {
globalData()->magazines.push_back(thr.alternate);
}
}
--globalData()->activeThreads;
LeaveCriticalSection(&globalData()->mutex);
thr.count = 0;
thr.alternate = nullptr;
thr.freelist = nullptr;
}
}
void releaseAllThreadMagazines() {
FastAllocator<16>::releaseThreadMagazines();
FastAllocator<32>::releaseThreadMagazines();
FastAllocator<64>::releaseThreadMagazines();
FastAllocator<96>::releaseThreadMagazines();
FastAllocator<128>::releaseThreadMagazines();
FastAllocator<256>::releaseThreadMagazines();
FastAllocator<512>::releaseThreadMagazines();
FastAllocator<1024>::releaseThreadMagazines();
FastAllocator<2048>::releaseThreadMagazines();
FastAllocator<4096>::releaseThreadMagazines();
FastAllocator<8192>::releaseThreadMagazines();
}
int64_t getTotalUnusedAllocatedMemory() {
int64_t unusedMemory = 0;
unusedMemory += FastAllocator<16>::getApproximateMemoryUnused();
unusedMemory += FastAllocator<32>::getApproximateMemoryUnused();
unusedMemory += FastAllocator<64>::getApproximateMemoryUnused();
unusedMemory += FastAllocator<96>::getApproximateMemoryUnused();
unusedMemory += FastAllocator<128>::getApproximateMemoryUnused();
unusedMemory += FastAllocator<256>::getApproximateMemoryUnused();
unusedMemory += FastAllocator<512>::getApproximateMemoryUnused();
unusedMemory += FastAllocator<1024>::getApproximateMemoryUnused();
unusedMemory += FastAllocator<2048>::getApproximateMemoryUnused();
unusedMemory += FastAllocator<4096>::getApproximateMemoryUnused();
unusedMemory += FastAllocator<8192>::getApproximateMemoryUnused();
return unusedMemory;
}
template class FastAllocator<16>;
template class FastAllocator<32>;
template class FastAllocator<64>;
template class FastAllocator<96>;
template class FastAllocator<128>;
template class FastAllocator<256>;
template class FastAllocator<512>;
template class FastAllocator<1024>;
template class FastAllocator<2048>;
template class FastAllocator<4096>;
template class FastAllocator<8192>;
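/*
 * Usage sketch (illustrative only, not part of this translation unit).
 * Client code typically reaches FastAllocator through the FastAllocated<T>
 * mixin declared in FastAlloc.h, which overrides operator new/delete to use
 * the size class covering sizeof(T). A minimal sketch, assuming that
 * interface:
 *
 *   struct MyMessage : public FastAllocated<MyMessage> {
 *       int64_t id;
 *       double timestamp;
 *   };
 *
 *   MyMessage* m = new MyMessage(); // served from a FastAllocator size class
 *   delete m;                       // returned to the thread-local freelist
 *
 * The size-class templates instantiated above can also be used directly for
 * fixed-size buffers:
 *
 *   void* p = FastAllocator<4096>::allocate();
 *   FastAllocator<4096>::release(p);
 */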