/*
 * VFSAsync.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "sqlite/sqlite3.h"
#include <stdio.h>
#include <string>
#include <vector>
#include "fdbrpc/fdbrpc.h"
#include "fdbrpc/IAsyncFile.h"
#include "fdbserver/CoroFlow.h"
#include "fdbrpc/simulator.h"
#include "fdbrpc/AsyncFileReadAhead.actor.h"

#include <assert.h>
#include <string.h>

#ifdef WIN32
#include <Windows.h>
#endif

#ifdef __unixish__
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <sys/param.h>
#include <sys/time.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#endif

#include "fdbserver/VFSAsync.h"

/*
** The maximum pathname length supported by this VFS.
*/
#define MAXPATHNAME 512

// Lock levels received as the eLock argument of asyncLock() / asyncUnlock() below.
#define NO_LOCK 0
#define SHARED_LOCK 1
#define RESERVED_LOCK 2
#define PENDING_LOCK 3
#define EXCLUSIVE_LOCK 4
// asyncCheckReservedLock() reports a reserved lock once a file's shared lock counter reaches this value.
const uint32_t RESERVED_COUNT = 1U << 29;

// Builds the file object for `filename` and registers it in the shared per-filename counter table:
// pLockCount is pointed at the entry's lock counter and the entry's open counter is bumped.
VFSAsyncFile::VFSAsyncFile(std::string const& filename, int flags)
  : flags(flags), filename(filename), pLockCount(&filename_lockCount_openCount[filename].first), debug_zcrefs(0),
    debug_zcreads(0), debug_reads(0), chunkSize(0) {
	// The entry already exists at this point (the initializer list looked it up with operator[]).
	auto& counters = filename_lockCount_openCount[filename];
	++counters.second;

	TraceEvent(SevDebug, "VFSAsyncFileConstruct")
	    .detail("Filename", filename)
	    .detail("OpenCount", counters.second)
	    .detail("LockCount", counters.first)
	    .backtrace();
}

// Shared table keyed by filename. For each entry, .first is the counter traced as "LockCount" (and pointed
// to by pLockCount) and .second is the counter traced as "OpenCount".
std::map<std::string, std::pair<uint32_t, int>> VFSAsyncFile::filename_lockCount_openCount;

// xClose: destroys the VFSAsyncFile that lives in the storage pointed to by pFile.
// Always returns SQLITE_OK.
static int asyncClose(sqlite3_file* pFile) {
	VFSAsyncFile* p = (VFSAsyncFile*)pFile;

	TraceEvent(SevDebug, "VFSAsyncFileDestroy").detail("Filename", p->filename).backtrace();

	// Every zero-copy reference handed out must have been released before the file is closed.
	ASSERT(!p->debug_zcrefs);

	// Only the destructor runs here; this function does not free the storage itself.
	p->~VFSAsyncFile();

	return SQLITE_OK;
}

// xRead: reads iAmt bytes at offset iOfst into zBuf and blocks until the read completes.
// Returns SQLITE_OK on a full read, SQLITE_IOERR_SHORT_READ (with the unread tail zeroed) when
// fewer bytes were available, and SQLITE_IOERR_READ if the underlying read throws an Error.
static int asyncRead(sqlite3_file* pFile, void* zBuf, int iAmt, sqlite_int64 iOfst) {
	VFSAsyncFile* self = (VFSAsyncFile*)pFile;
	try {
		++self->debug_reads;
		const int got = waitForAndGet(self->file->read(zBuf, iAmt, iOfst));
		if (got >= iAmt) {
			return SQLITE_OK;
		}

		// When reading past the EOF, sqlite expects the extra portion of the buffer to be zeroed.
		uint8_t* tail = (uint8_t*)zBuf + got;
		memset(tail, 0, iAmt - got);
		return SQLITE_IOERR_SHORT_READ;
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_READ);
		}
		return SQLITE_IOERR_READ;
	}
}

#if 1
// Releases a buffer previously obtained from asyncReadZeroCopy().
// Returns SQLITE_OK, or SQLITE_IOERR if the underlying release throws an Error.
static int asyncReleaseZeroCopy(sqlite3_file* pFile, void* data, int iAmt, sqlite_int64 iOfst) {
	VFSAsyncFile* self = (VFSAsyncFile*)pFile;
	try {
		// The outstanding-reference counter is dropped first, so it is decremented even if the release throws.
		self->debug_zcrefs -= 1;
		self->file->releaseZeroCopy(data, iAmt, iOfst);
		return SQLITE_OK;
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR);
		}
		return SQLITE_IOERR;
	}
}

// Zero-copy read: on success *data points at iAmt bytes of file content starting at iOfst, which the
// caller must later hand back through asyncReleaseZeroCopy().
// If pDataWasCached is non-null it is set to 1 when the read future was already ready on return
// from readZeroCopy(), and to 0 otherwise.
// Returns SQLITE_OK, SQLITE_IOERR_SHORT_READ (buffer already released) or SQLITE_IOERR_READ.
static int asyncReadZeroCopy(sqlite3_file* pFile, void** data, int iAmt, sqlite_int64 iOfst, int* pDataWasCached) {
	VFSAsyncFile* self = (VFSAsyncFile*)pFile;
	try {
		int got = iAmt;
		Future<Void> pending = self->file->readZeroCopy(data, &got, iOfst);
		if (pDataWasCached != nullptr) {
			if (pending.isReady()) {
				*pDataWasCached = 1;
			} else {
				*pDataWasCached = 0;
			}
		}
		waitFor(pending);
		++self->debug_zcrefs;

		if (got >= iAmt) {
			++self->debug_zcreads;
			return SQLITE_OK;
		}

		// When reading past the EOF, sqlite expects the extra portion of the buffer to be zeroed.  We can't do
		// that, so return and sqlite will use the slow path.
		asyncReleaseZeroCopy(pFile, *data, got, iOfst);
		return SQLITE_IOERR_SHORT_READ;
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_READ);
		}
		return SQLITE_IOERR_READ;
	}
}

#else
// Fallback "zero copy" read (this preprocessor branch is currently disabled): allocates a fresh
// buffer with new[] and fills it with an ordinary read. The buffer is handed out through *data.
static int asyncReadZeroCopy(sqlite3_file* pFile, void** data, int iAmt, sqlite_int64 iOfst) {
	VFSAsyncFile* self = (VFSAsyncFile*)pFile;
	try {
		char* buffer = new char[iAmt];
		*data = buffer;
		const int got = waitForAndGet(self->file->read(*data, iAmt, iOfst));
		if (got >= iAmt) {
			return SQLITE_OK;
		}

		// When reading past the EOF, sqlite expects the extra portion of the buffer to be zeroed.
		uint8_t* tail = (uint8_t*)*data + got;
		memset(tail, 0, iAmt - got);
		return SQLITE_IOERR_SHORT_READ;
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_READ);
		}
		return SQLITE_IOERR_READ;
	}
}
// Counterpart of the fallback asyncReadZeroCopy() in this (disabled) branch: frees the buffer
// that was allocated with new char[].
static int asyncReleaseZeroCopy(sqlite3_file* pFile, void* data, int iAmt, sqlite_int64 iOfst) {
	char* buffer = (char*)data;
	delete[] buffer;
	return SQLITE_OK;
}
#endif

// xWrite: writes iAmt bytes from zBuf at offset iOfst and blocks until the write completes.
// Returns SQLITE_OK, or SQLITE_IOERR_WRITE if the underlying write throws an Error.
static int asyncWrite(sqlite3_file* pFile, const void* zBuf, int iAmt, sqlite_int64 iOfst) {
	VFSAsyncFile* self = (VFSAsyncFile*)pFile;
	try {
		waitFor(self->file->write(zBuf, iAmt, iOfst));
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_WRITE);
		}
		return SQLITE_IOERR_WRITE;
	}
	return SQLITE_OK;
}

// xTruncate: resizes the file to `size` bytes. When a chunk size has been configured, the
// requested size is first rounded up to a multiple of it.
// Returns SQLITE_OK, or SQLITE_IOERR_TRUNCATE if the underlying truncate throws an Error.
static int asyncTruncate(sqlite3_file* pFile, sqlite_int64 size) {
	VFSAsyncFile* self = (VFSAsyncFile*)pFile;

	const auto chunk = self->chunkSize;
	if (chunk != 0) {
		// Round up to the next chunk boundary.
		size = ((size + chunk - 1) / chunk) * chunk;
	}

	try {
		waitFor(self->file->truncate(size));
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_TRUNCATE);
		}
		return SQLITE_IOERR_TRUNCATE;
	}
	return SQLITE_OK;
}

// xSync: blocks until the underlying file's sync() completes. The flags argument is not consulted.
// Returns SQLITE_OK, or SQLITE_IOERR_FSYNC (after tracing the error) if sync() throws an Error.
static int asyncSync(sqlite3_file* pFile, int flags) {
	VFSAsyncFile* self = (VFSAsyncFile*)pFile;
	try {
		waitFor(self->file->sync());
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_FSYNC);
		}

		// Log both handles so the failure can be tied to a specific open file.
		TraceEvent("VFSAsyncFileSyncError")
		    .error(e)
		    .detail("Filename", self->filename)
		    .detail("Sqlite3File", (int64_t)pFile)
		    .detail("IAsyncFile", (int64_t)self->file.getPtr());

		return SQLITE_IOERR_FSYNC;
	}
	return SQLITE_OK;
}

/*
** Write the size of the file in bytes to *pSize.
*/
// xFileSize: stores the current size reported by the underlying file in *pSize.
// Returns SQLITE_OK, or SQLITE_IOERR_FSTAT if the size query throws an Error (*pSize is then untouched).
static int VFSAsyncFileSize(sqlite3_file* pFile, sqlite_int64* pSize) {
	VFSAsyncFile* self = (VFSAsyncFile*)pFile;
	try {
		const sqlite_int64 bytes = waitForAndGet(self->file->size());
		*pSize = bytes;
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_FSTAT);
		}
		return SQLITE_IOERR_FSTAT;
	}
	return SQLITE_OK;
}

// xLock: no lock state is recorded. A request for EXCLUSIVE_LOCK is always refused with
// SQLITE_BUSY; every other level is reported as granted.
static int asyncLock(sqlite3_file* pFile, int eLock) {
	if (eLock == EXCLUSIVE_LOCK) {
		return SQLITE_BUSY;
	}
	return SQLITE_OK;
}
// xUnlock: nothing to release because asyncLock() records no state. Callers are only expected
// to unlock down to SHARED_LOCK or NO_LOCK, which is checked in debug builds.
static int asyncUnlock(sqlite3_file* pFile, int eLock) {
	(void)pFile;
	assert(eLock <= SHARED_LOCK);
	return SQLITE_OK;
}
// xCheckReservedLock: sets *pResOut to 1 when the lock counter shared by all handles on this
// filename has reached RESERVED_COUNT, and to 0 otherwise. Always returns SQLITE_OK.
static int asyncCheckReservedLock(sqlite3_file* pFile, int* pResOut) {
	VFSAsyncFile* self = (VFSAsyncFile*)pFile;
	const bool reserved = *self->pLockCount >= RESERVED_COUNT;
	*pResOut = reserved ? 1 : 0;
	return SQLITE_OK;
}

/*
** xFileControl. Two verbs are handled by this VFS:
**
**   SQLITE_FCNTL_CHUNK_SIZE  - pArg points to an int; it is stored as the
**                              file's chunk size (used by asyncTruncate).
**   SQLITE_FCNTL_SIZE_HINT   - pArg points to a 64-bit size; the file is
**                              resized to it through asyncTruncate.
**
** Every other verb returns SQLITE_NOTFOUND.
*/
static int VFSAsyncFileControl(sqlite3_file* pFile, int op, void* pArg) {
	VFSAsyncFile* self = (VFSAsyncFile*)pFile;

	if (op == SQLITE_FCNTL_CHUNK_SIZE) {
		self->chunkSize = *(int*)pArg;
		return SQLITE_OK;
	}

	if (op == SQLITE_FCNTL_SIZE_HINT) {
		// NOTE(review): this goes through truncate unconditionally, so a hint smaller than the
		// current file size would shrink the file - confirm that is intended.
		return asyncTruncate(pFile, *(int64_t*)pArg);
	}

	return SQLITE_NOTFOUND;
}

// xSectorSize: reports a fixed sector size of 512 bytes for every file.
// SOMEDAY: Would 4K be better?
static int asyncSectorSize(sqlite3_file* pFile) {
	(void)pFile;
	return 512;
}
// xDeviceCharacteristics: returns 0, i.e. no device characteristic flags are advertised.
static int asyncDeviceCharacteristics(sqlite3_file* pFile) {
	(void)pFile;
	return 0;
}

#if 1
struct SharedMemoryInfo { // for a file
	std::string filename;
	std::vector<void*> regions;
	int regionSize;
	int refcount; // Number of connections with this open
	int sharedLocks[SQLITE_SHM_NLOCK];
	int exclusiveLocks[SQLITE_SHM_NLOCK];

	SharedMemoryInfo() : regionSize(0), refcount(0) {
		memset(sharedLocks, 0, sizeof(sharedLocks));
		memset(exclusiveLocks, 0, sizeof(exclusiveLocks));
	}
	void cleanup() {
		for (int i = 0; i < regions.size(); i++)
			delete[](uint8_t*) regions[i];
		table.erase(filename);
	}

	static Mutex mutex;
	static std::map<std::string, SharedMemoryInfo> table;
};
Mutex SharedMemoryInfo::mutex;
std::map<std::string, SharedMemoryInfo> SharedMemoryInfo::table;

/*
** This function is called to obtain a pointer to region iRegion of the
** shared-memory associated with the database file fd. Shared-memory regions
** are numbered starting from zero. Each shared-memory region is szRegion
** bytes in size.
**
** If an error occurs, an error code is returned and *pp is set to nullptr.
**
** Otherwise, if the bExtend parameter is 0 and the requested shared-memory
** region has not been allocated (by any client, including one running in a
** separate process), then *pp is set to nullptr and SQLITE_OK returned. If
** bExtend is non-zero and the requested shared-memory region has not yet
** been allocated, it is allocated by this function.
**
** If the shared-memory region has already been allocated or is allocated by
** this call as described above, then it is mapped into this processes
** address space (if it is not already), *pp is set to point to the mapped
** memory and SQLITE_OK returned.
*/
static int asyncShmMap(sqlite3_file* fd, /* Handle open on database file */
                       int iRegion, /* Region to retrieve */
                       int szRegion, /* Size of regions */
                       int bExtend, /* True to extend file if necessary */
                       void volatile** pp /* OUT: Mapped memory */
) {
	MutexHolder hold(SharedMemoryInfo::mutex);

	VFSAsyncFile* conn = (VFSAsyncFile*)fd;
	SharedMemoryInfo* memInfo = conn->sharedMemory;
	if (memInfo != nullptr) {
		// Already attached: every call must agree on the region size.
		assert(memInfo->regionSize == szRegion);
	} else {
		// First call on this connection: attach to (or create) the table entry for this file.
		std::string key = conn->filename;
		memInfo = &SharedMemoryInfo::table[key];
		conn->sharedMemory = memInfo;
		memInfo->filename = key;
		memInfo->regionSize = szRegion;
		++memInfo->refcount;
	}

	const bool notYetAllocated = iRegion >= memInfo->regions.size();
	if (notYetAllocated && !bExtend) {
		*pp = nullptr;
		return SQLITE_OK;
	}

	// Grow the region list with zero-filled buffers until iRegion exists (no-op if it already does).
	while (memInfo->regions.size() <= iRegion) {
		uint8_t* fresh = new uint8_t[szRegion];
		memset(fresh, 0, szRegion);
		memInfo->regions.push_back(fresh);
	}

	*pp = memInfo->regions[iRegion];
	return SQLITE_OK;
}

/*
** Change the lock state for a shared-memory segment.
**
** Note that the relationship between SHAREd and EXCLUSIVE locks is a little
** different here than in posix.  In xShmLock(), one can go from unlocked
** to shared and back or from unlocked to exclusive and back.  But one may
** not go from shared to exclusive or from exclusive to shared.
*/
// sqlite doesn't seem to match these up correctly - it happily calls unlock on locks it doesn't hold.
// So we have to keep track of which locks are held by a given sqlite3_file
static int asyncShmLock(sqlite3_file* fd, /* Database file holding the shared memory */
                        int ofst, /* First lock to acquire or release */
                        int n, /* Number of locks to acquire or release */
                        int flags /* What to do with the lock */
) {
	assert(ofst >= 0 && ofst + n <= SQLITE_SHM_NLOCK);
	assert(n >= 1);
	assert(flags == (SQLITE_SHM_LOCK | SQLITE_SHM_SHARED) || flags == (SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE) ||
	       flags == (SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED) || flags == (SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE));
	assert(n == 1 || (flags & SQLITE_SHM_EXCLUSIVE) != 0);

	MutexHolder hold(SharedMemoryInfo::mutex);

	VFSAsyncFile* conn = (VFSAsyncFile*)fd;
	SharedMemoryInfo* shm = conn->sharedMemory;
	const int end = ofst + n;

	// Unlock: drop whichever locks (shared and/or exclusive) this connection actually holds on
	// each slot; slots it does not hold are left alone.
	if (flags & SQLITE_SHM_UNLOCK) {
		for (int slot = ofst; slot < end; ++slot) {
			const int bit = 1 << slot;
			if (conn->sharedMemorySharedLocks & bit) {
				conn->sharedMemorySharedLocks &= ~bit;
				--shm->sharedLocks[slot];
			}
			if (conn->sharedMemoryExclusiveLocks & bit) {
				conn->sharedMemoryExclusiveLocks &= ~bit;
				--shm->exclusiveLocks[slot];
			}
		}
		return SQLITE_OK;
	}

	// Shared lock: refused if any slot has an exclusive holder other than this connection.
	if (flags & SQLITE_SHM_SHARED) {
		for (int slot = ofst; slot < end; ++slot) {
			const auto mineExclusive = (conn->sharedMemoryExclusiveLocks >> slot) & 1;
			if (shm->exclusiveLocks[slot] != mineExclusive) {
				return SQLITE_BUSY;
			}
		}
		for (int slot = ofst; slot < end; ++slot) {
			const int bit = 1 << slot;
			if (!(conn->sharedMemorySharedLocks & bit)) {
				conn->sharedMemorySharedLocks |= bit;
				shm->sharedLocks[slot]++;
			}
		}
		return SQLITE_OK;
	}

	// Exclusive lock: refused if any slot has a shared or exclusive holder other than this connection.
	for (int slot = ofst; slot < end; ++slot) {
		const auto mineExclusive = (conn->sharedMemoryExclusiveLocks >> slot) & 1;
		const auto mineShared = (conn->sharedMemorySharedLocks >> slot) & 1;
		if (shm->exclusiveLocks[slot] != mineExclusive || shm->sharedLocks[slot] != mineShared) {
			return SQLITE_BUSY;
		}
	}
	for (int slot = ofst; slot < end; ++slot) {
		const int bit = 1 << slot;
		if (!(conn->sharedMemoryExclusiveLocks & bit)) {
			conn->sharedMemoryExclusiveLocks |= bit;
			shm->exclusiveLocks[slot]++;
		}
	}
	return SQLITE_OK;
}

/*
** Implement a memory barrier or memory fence on shared memory.
**
** All loads and stores begun before the barrier must complete before
** any load or store begun after the barrier.
*/
// xShmBarrier: issues a full memory fence so that loads and stores made before the call are
// ordered before those made after it.
static void asyncShmBarrier(sqlite3_file*) {
#ifdef WIN32
	// _ReadWriteBarrier() only restricts compiler reordering and emits no fence instruction, so it
	// does not meet the contract above. MemoryBarrier() (from Windows.h, included above under the
	// same WIN32 guard) is a real hardware barrier.
	MemoryBarrier();
#else
	__sync_synchronize();
#endif
}

/*
** Close a connection to shared-memory.  The underlying storage is not
** freed here, and deleteFlag is expected to be false (this is asserted
** when the last connection detaches).
**
** If there is no shared memory associated with the connection then this
** routine is a harmless no-op.
*/
static int asyncShmUnmap(sqlite3_file* fd, /* The underlying database file */
                         int deleteFlag /* Delete shared-memory if true */
) {
	MutexHolder hold(SharedMemoryInfo::mutex);

	VFSAsyncFile* conn = (VFSAsyncFile*)fd;
	SharedMemoryInfo* shm = conn->sharedMemory;
	if (shm == nullptr) {
		// This connection never attached to shared memory.
		return SQLITE_OK;
	}

	// Detach the connection and drop its reference. The regions themselves are kept; they are
	// released from the VFSAsyncFile destructor instead.
	conn->sharedMemory = nullptr;
	shm->refcount -= 1;
	if (shm->refcount == 0) {
		// We don't think deleteFlag will ever be set
		ASSERT(!deleteFlag);
	}
	return SQLITE_OK;
}

// Drops this handle's entry from the per-filename open count. When the last handle for a
// filename goes away, its counter entry and any shared memory registered under the same name
// are removed as well.
VFSAsyncFile::~VFSAsyncFile() {
	auto& counters = filename_lockCount_openCount[filename];

	TraceEvent(SevDebug, "VFSAsyncFileDestroyStart")
	    .detail("Filename", filename)
	    .detail("OpenCount", counters.second)
	    .detail("LockCount", counters.first)
	    .backtrace();

	counters.second -= 1;
	if (counters.second != 0) {
		// Other handles on this filename are still open.
		return;
	}

	// `counters` dangles after this erase and must not be used below.
	filename_lockCount_openCount.erase(filename);

	TraceEvent(SevDebug, "VFSAsyncFileDestroy").detail("Filename", filename).backtrace();

	// Always delete the shared memory when the last copy of the file is deleted.  In simulation, this is helpful
	// because "killing" a file without properly closing it can result in a shared memory state that causes
	// corruption when reopening the killed file.  The only expected penalty from doing this is a potentially slower
	// open operation on a database, but that should happen infrequently.
	//
	// We can't do this in ShmUnmap when refcount is 0 because it seems that SQLite sometimes subsequently tries to
	// reopen the WAL from multiple locations simultaneously, resulting in a locking error
	auto itr = SharedMemoryInfo::table.find(filename);
	if (itr != SharedMemoryInfo::table.end()) {
		ASSERT_ABORT(itr->second.refcount == 0);
		itr->second.cleanup();
	}
}

#endif

/*
** Open a file handle.
*/
static int asyncOpen(sqlite3_vfs* pVfs, /* VFS */
                     const char* zName, /* File to open, or 0 for a temp file */
                     sqlite3_file* pFile, /* Pointer to VFSAsyncFile struct to populate */
                     int flags, /* Input SQLITE_OPEN_XXX flags */
                     int* pOutFlags /* Output SQLITE_OPEN_XXX flags (or nullptr) */
) {
	// Method table installed on every file this VFS opens successfully.
	static const sqlite3_io_methods asyncio = {
		3, /* iVersion */
		asyncClose, /* xClose */
		asyncRead, /* xRead */
		asyncWrite, /* xWrite */
		asyncTruncate, /* xTruncate */
		asyncSync, /* xSync */
		VFSAsyncFileSize, /* xFileSize */
		asyncLock, /* xLock */
		asyncUnlock, /* xUnlock */
		asyncCheckReservedLock, /* xCheckReservedLock */
		VFSAsyncFileControl, /* xFileControl */
		asyncSectorSize, /* xSectorSize */
		asyncDeviceCharacteristics, /* xDeviceCharacteristics */
		asyncShmMap,
		asyncShmLock,
		asyncShmBarrier,
		asyncShmUnmap,
		asyncReadZeroCopy,
		asyncReleaseZeroCopy
	};

	// The open flags are passed straight through to IAsyncFile, so the numeric values must agree.
	static_assert(
	    SQLITE_OPEN_EXCLUSIVE == IAsyncFile::OPEN_EXCLUSIVE && SQLITE_OPEN_CREATE == IAsyncFile::OPEN_CREATE &&
	        SQLITE_OPEN_READONLY == IAsyncFile::OPEN_READONLY && SQLITE_OPEN_READWRITE == IAsyncFile::OPEN_READWRITE,
	    "SQLite flag values don't match IAsyncFile flag values");

	// A null name (temp file request) is rejected.
	if (!zName)
		return SQLITE_IOERR;

	// File creation here is disabled because we always create the files first in KeyValueStoreSQLite, using atomic
	// creation
	int openFlags = flags & (SQLITE_OPEN_READONLY | SQLITE_OPEN_READWRITE);
	if (flags & SQLITE_OPEN_WAL) {
		openFlags |= IAsyncFile::OPEN_LARGE_PAGES;
	}
	openFlags |= IAsyncFile::OPEN_LOCK;

	// Zero the caller-provided storage, then construct the file object in place.
	// NOTE(review): members not named in the constructor's initializer list appear to rely on
	// this zeroing - confirm against the class definition.
	VFSAsyncFile* p = (VFSAsyncFile*)pFile; /* Populate this structure */
	memset(static_cast<void*>(p), 0, sizeof(VFSAsyncFile));
	new (p) VFSAsyncFile(zName, flags);

	try {
		// Note that SQLiteDB::open also opens the db file, so its flags and modes are important, too
		p->file = waitForAndGet(IAsyncFileSystem::filesystem()->open(p->filename, openFlags, 0600));

		TraceEvent(SevDebug, "VFSAsyncFileOpened").detail("Filename", p->filename).backtrace();
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_CANTOPEN);
		}
		TraceEvent("VFSAsyncFileOpenError").error(e).detail("Filename", p->filename);
		// pMethods is still null here, so the object is torn down manually.
		p->~VFSAsyncFile();
		return SQLITE_CANTOPEN;
	}

	if (pOutFlags != nullptr) {
		*pOutFlags = flags;
	}
	p->base.pMethods = &asyncio;
	return SQLITE_OK;
}

// The next few functions, which perform filesystem operations by path rather than by file, have
// OS-specific implementations.

/*
** Delete the file identified by argument zPath. If the dirSync parameter
** is non-zero, then ensure the file-system modification to delete the
** file has been synced to disk before returning.
*/
// xDelete: not supported. Reaching this function trips an ASSERT; SQLITE_IOERR_DELETE is the
// nominal return value.
static int asyncDelete(sqlite3_vfs* pVfs, const char* zPath, int dirSync) {
	// At the moment this isn't used; hence isn't under test.  Could easily use
	// IAsyncFileSystem::filesystem()->deleteFile().
	ASSERT(false);
	return SQLITE_IOERR_DELETE;
}

/*
** Query the file-system to see if the named file exists, is readable or
** is both readable and writable.  For an exists query, treat a zero-length file
** as if it does not exist.
*/
static int asyncAccess(sqlite3_vfs* pVfs, const char* zPath, int flags, int* pResOut) {
#ifdef __unixish__
#ifndef F_OK
#define F_OK 0
#endif
#ifndef R_OK
#define R_OK 4
#endif
#ifndef W_OK
#define W_OK 2
#endif
	assert(flags == SQLITE_ACCESS_EXISTS /* access(zPath, F_OK) */
	       || flags == SQLITE_ACCESS_READ /* access(zPath, R_OK) */
	       || flags == SQLITE_ACCESS_READWRITE /* access(zPath, R_OK|W_OK) */
	);

	// Translate the SQLite query into the mode argument of access().
	int mode = F_OK;
	if (flags == SQLITE_ACCESS_READ) {
		mode = R_OK;
	} else if (flags == SQLITE_ACCESS_READWRITE) {
		mode = R_OK | W_OK;
	}

	const bool accessible = access(zPath, mode) == 0;
	*pResOut = accessible;

	// An existence query treats a zero-length file as missing.
	if (flags == SQLITE_ACCESS_EXISTS && accessible) {
		struct stat info;
		if (stat(zPath, &info) == 0 && info.st_size == 0) {
			*pResOut = 0;
		}
	}
	return SQLITE_OK;
#else
	WIN32_FILE_ATTRIBUTE_DATA info;
	memset(&info, 0, sizeof(info));

	// Stays INVALID_FILE_ATTRIBUTES when the file is missing, or is empty during an existence query.
	DWORD attributes = INVALID_FILE_ATTRIBUTES;
	if (GetFileAttributesEx(zPath, GetFileExInfoStandard, &info)) {
		const bool emptyFile = info.nFileSizeHigh == 0 && info.nFileSizeLow == 0;
		if (flags != SQLITE_ACCESS_EXISTS || !emptyFile) {
			attributes = info.dwFileAttributes;
		}
	} else if (GetLastError() != ERROR_FILE_NOT_FOUND) {
		return SQLITE_IOERR_ACCESS;
	}

	if (flags == SQLITE_ACCESS_READWRITE) {
		*pResOut = (attributes & FILE_ATTRIBUTE_READONLY) == 0;
	} else {
		*pResOut = attributes != INVALID_FILE_ATTRIBUTES;
	}
	return SQLITE_OK;
#endif
}

/*
** Argument zPath points to a nul-terminated string containing a file path.
** If zPath is an absolute path, then it is copied as is into the output
** buffer. Otherwise, if it is a relative path, then the equivalent full
** path is written to the output buffer.
*/
// xFullPathname: writes the absolute form of zPath, NUL-terminated, into zPathOut.
// Returns SQLITE_OK on success. Returns SQLITE_IOERR when the result (including its terminator)
// does not fit in nPathOut bytes, when nPathOut is not positive, or when resolving the path throws.
static int asyncFullPathname(sqlite3_vfs* pVfs, /* VFS */
                             const char* zPath, /* Input path (possibly a relative path) */
                             int nPathOut, /* Size of output buffer in bytes */
                             char* zPathOut /* Pointer to output buffer */
) {
	try {
		auto s = abspath(zPath);
		// Compare as unsigned only after ruling out non-positive sizes; otherwise a negative
		// nPathOut would convert to a huge value, pass the check and let memcpy overrun the buffer.
		if (nPathOut <= 0 || s.size() >= static_cast<size_t>(nPathOut))
			return SQLITE_IOERR;
		memcpy(zPathOut, s.c_str(), s.size() + 1);
		return SQLITE_OK;
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR);
		}
		TraceEvent(SevError, "VFSAsyncFullPathnameError").error(e).detail("PathIn", (std::string)zPath);
		return SQLITE_IOERR;
	} catch (...) {
		TraceEvent(SevError, "VFSAsyncFullPathnameError").error(unknown_error()).detail("PathIn", (std::string)zPath);
		return SQLITE_IOERR;
	}
}

/*
** Returns true if there is a shared memory entry for the specified filename,
** and false otherwise.
*/
bool vfsAsyncIsOpen(std::string filename) {
	// The lookup uses the absolute form of the given name.
	const auto& shmTable = SharedMemoryInfo::table;
	return shmTable.find(abspath(filename)) != shmTable.end();
}

/*
** The following four VFS methods:
**
**   xDlOpen
**   xDlError
**   xDlSym
**   xDlClose
**
** are supposed to implement the functionality needed by SQLite to load
** extensions compiled as shared objects. This simple VFS does not support
** this functionality, so the following functions are no-ops.
*/
// xDlOpen: loadable extensions are unsupported, so no handle is ever returned.
static void* asyncDlOpen(sqlite3_vfs* pVfs, const char* zPath) {
	return nullptr;
}
// xDlError: writes a fixed "not supported" message into zErrMsg, a buffer of nByte bytes.
// Does nothing when there is no usable buffer (null pointer or non-positive size).
static void asyncDlError(sqlite3_vfs* pVfs, int nByte, char* zErrMsg) {
	// Without this guard, zErrMsg[nByte - 1] below would write outside the buffer for nByte <= 0.
	if (zErrMsg == nullptr || nByte <= 0)
		return;
	sqlite3_snprintf(nByte, zErrMsg, "Loadable extensions are not supported");
	zErrMsg[nByte - 1] = '\0';
}
// xDlSym: loadable extensions are unsupported, so no symbol is ever resolved.
static void (*asyncDlSym(sqlite3_vfs* pVfs, void* pH, const char* z))(void) {
	return nullptr;
}
// xDlClose: nothing to close because asyncDlOpen() never hands out a handle.
static void asyncDlClose(sqlite3_vfs* pVfs, void* pHandle) {}

/*
** Parameter zByte points to a buffer nByte bytes in size. Populate this
** buffer with pseudo-random data.
*/
// xRandomness: fills zByte[0..nByte) one byte at a time with values drawn from
// deterministicRandom(). Always returns SQLITE_OK.
static int asyncRandomness(sqlite3_vfs* pVfs, int nByte, char* zByte) {
	int filled = 0;
	while (filled < nByte) {
		zByte[filled] = deterministicRandom()->randomInt(0, 256);
		++filled;
	}
	return SQLITE_OK;
}

/*
** Sleep for at least nMicro microseconds. Return the (approximate) number
** of microseconds slept for.
*/
// xSleep: waits for the requested number of microseconds and returns that number.
// In simulation the wait also ends when the current process's shutdown signal fires. If that
// signal is already set on entry, the function waits MAX_BUGGIFIED_DELAY instead and returns 0.
// Returns 0 (after tracing) if waiting throws an Error.
static int asyncSleep(sqlite3_vfs* pVfs, int microseconds) {
	try {
		Future<Void> shutdown = Never();
		if (g_network->isSimulated()) {
			shutdown = success(g_simulator.getCurrentProcess()->shutdownSignal.getFuture());
		}

		if (!shutdown.isReady()) {
			waitFor(g_network->delay(microseconds * 1e-6, TaskPriority::DefaultDelay) || shutdown);
			return microseconds;
		}

		// Shutdown already signalled.
		waitFor(delay(FLOW_KNOBS->MAX_BUGGIFIED_DELAY));
		return 0;
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_ERROR);
		}
		TraceEvent(SevError, "VFSAsyncSleepError").errorUnsuppressed(e);
		return 0;
	}
}

/*
** Find the current time (in Universal Coordinated Time).  Write into *piNow
** the current time and date as a Julian Day number times 86_400_000.  In
** other words, write into *piNow the number of milliseconds since the Julian
** epoch of noon in Greenwich on November 24, 4714 B.C according to the
** proleptic Gregorian calendar.
**
** On success, return 0.  Return 1 if the time and date cannot be found.
*/
static int asyncCurrentTimeInt64(sqlite3_vfs* NotUsed, sqlite3_int64* piNow) {
#if __unixish__
	// Added to the millisecond Unix timestamp to rebase it onto the Julian epoch.
	static const sqlite3_int64 unixEpoch = 24405875 * (sqlite3_int64)8640000;
	struct timeval now;
	gettimeofday(&now, nullptr);
	const sqlite3_int64 wholeSecondsMs = 1000 * (sqlite3_int64)now.tv_sec;
	const sqlite3_int64 fractionMs = now.tv_usec / 1000;
	*piNow = unixEpoch + wholeSecondsMs + fractionMs;
#elif defined(_WIN32)
	// Added to the millisecond FILETIME value to rebase it onto the Julian epoch.
	static const sqlite3_int64 winFiletimeEpoch = 23058135 * (sqlite3_int64)8640000;
	int64_t fileTime = 0;
	GetSystemTimeAsFileTime((FILETIME*)&fileTime);
	// FILETIME counts 100-nanosecond ticks; 10000 ticks make one millisecond.
	const int64_t fileTimeMs = fileTime / 10000;
	*piNow = winFiletimeEpoch + fileTimeMs;
#else
#error Port me!
#endif
	return 0;
}

/*
** Set *pTime to the current UTC time expressed as a Julian day. Return
** SQLITE_OK if successful, or an error code otherwise.
**
**   http://en.wikipedia.org/wiki/Julian_day
*/
static int asyncCurrentTime(sqlite3_vfs* pVfs, double* pTime) {
	sqlite3_int64 julianMs = 0;
	const int rc = asyncCurrentTimeInt64(pVfs, &julianMs);
	if (rc == 0) {
		// Convert milliseconds to fractional days (86,400,000 ms per day).
		*pTime = julianMs / 86400000.0;
		return SQLITE_OK;
	}
	// Propagate the failure code; *pTime is left untouched.
	return rc;
}

// xGetLastError: no error detail is tracked by this VFS; always returns 0 and leaves the
// output buffer untouched.
static int asyncGetLastError(sqlite3_vfs* NotUsed, int NotUsed2, char* NotUsed3) {
	(void)NotUsed;
	(void)NotUsed2;
	(void)NotUsed3;
	return 0;
}

/*
** This function returns a pointer to the VFS implemented in this file.
** To make the VFS available to SQLite:
**
**   sqlite3_vfs_register(vfsAsync(), 0);
*/
sqlite3_vfs* vfsAsync() {
	// Single VFS descriptor for the whole process; every call returns the same object.
	static sqlite3_vfs asyncvfs = {
		3, /* iVersion */
		sizeof(VFSAsyncFile), /* szOsFile */
		MAXPATHNAME, /* mxPathname */
		nullptr, /* pNext */
		"fdb_async", /* zName */
		nullptr, /* pAppData */
		asyncOpen, /* xOpen */
		asyncDelete, /* xDelete */
		asyncAccess, /* xAccess */
		asyncFullPathname, /* xFullPathname */
		asyncDlOpen, /* xDlOpen */
		asyncDlError, /* xDlError */
		asyncDlSym, /* xDlSym */
		asyncDlClose, /* xDlClose */
		asyncRandomness, /* xRandomness */
		asyncSleep, /* xSleep */
		asyncCurrentTime, /* xCurrentTime */
		asyncGetLastError, /* xGetLastError */
		asyncCurrentTimeInt64, /* xCurrentTimeInt64 */
		nullptr, /* xSetSystemCall */
		nullptr, /* xGetSystemCall */
		nullptr, /* xNextSystemCall */
	};
	return &asyncvfs;
}