/*
 * VFSAsync.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "sqlite/sqlite3.h"
#include <stdio.h>
#include <string>
#include <vector>
#include "fdbrpc/fdbrpc.h"
#include "fdbrpc/IAsyncFile.h"
#include "fdbserver/CoroFlow.h"
#include "fdbrpc/simulator.h"
#include "fdbrpc/AsyncFileReadAhead.actor.h"

#include <assert.h>
#include <string.h>

#ifdef WIN32
#include <Windows.h>
#endif

#ifdef __unixish__
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <sys/param.h>
#include <sys/time.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#endif

#include "fdbserver/VFSAsync.h"

/*
** The maximum pathname length supported by this VFS.
*/
#define MAXPATHNAME 512

#define NO_LOCK         0
#define SHARED_LOCK     1
#define RESERVED_LOCK   2
#define PENDING_LOCK    3
#define EXCLUSIVE_LOCK  4
const uint32_t RESERVED_COUNT = 1U<<29;

// Builds the per-connection file object for `filename`.
//
// Looking the filename up in filename_lockCount_openCount creates the entry on first use;
// pLockCount is pointed at that entry's lock counter (.first) and the entry's open counter
// (.second) is bumped once for this instance.  The destructor undoes the bump.
VFSAsyncFile::VFSAsyncFile(std::string const& filename, int flags)
	: filename(filename),
	  flags(flags),
	  pLockCount(&filename_lockCount_openCount[filename].first),
	  debug_zcrefs(0),
	  debug_zcreads(0),
	  debug_reads(0),
	  chunkSize(0),
	  injectedError(false)
{
	std::pair<uint32_t,int>& entry = filename_lockCount_openCount[filename];
	++entry.second;
}

// filename -> (lock count, number of VFSAsyncFile instances currently constructed for it).
std::map<std::string, std::pair<uint32_t,int>> VFSAsyncFile::filename_lockCount_openCount;

// xClose: destroys the VFSAsyncFile that lives in the sqlite3_file buffer.
// Always returns SQLITE_OK.  Asserts that no zero-copy references are still outstanding.
static int asyncClose(sqlite3_file *pFile){
	VFSAsyncFile *p = reinterpret_cast<VFSAsyncFile*>(pFile);

	/*TraceEvent("VFSAsyncClose").detail("Fd", p->file->debugFD())
		.detail("Filename", p->filename).detail("ZCRefs", p->debug_zcrefs)
		.detail("ZCReads", p->debug_zcreads).detail("NormalReads", p->debug_reads).backtrace();*/
	//printf("Closing %s: %d zcrefs, %d/%d reads zc\n", filename.c_str(), debug_zcrefs, debug_zcreads, debug_zcreads+debug_reads);

	// Every successful zero-copy read must have been released by now.
	ASSERT( !p->debug_zcrefs );

	// The object was constructed in storage we do not own, so only run the destructor.
	p->~VFSAsyncFile();

	return SQLITE_OK;
}

// xRead: read iAmt bytes at offset iOfst into zBuf.
//
// Returns SQLITE_OK on a full read, SQLITE_IOERR_SHORT_READ (with the unread tail of zBuf
// zero-filled) when fewer bytes were available, and SQLITE_IOERR_READ if the read threw.
// An injected fault is additionally recorded on the file object.
static int asyncRead(sqlite3_file *pFile, void *zBuf, int iAmt, sqlite_int64 iOfst) {
	VFSAsyncFile *self = reinterpret_cast<VFSAsyncFile*>(pFile);
	try {
		self->debug_reads++;
		const int got = waitForAndGet( self->file->read( zBuf, iAmt, iOfst ) );
		if (got >= iAmt)
			return SQLITE_OK;

		// When reading past the EOF, sqlite expects the extra portion of the buffer to be zeroed
		memset(static_cast<uint8_t*>(zBuf) + got, 0, iAmt - got);
		return SQLITE_IOERR_SHORT_READ;
	} catch (Error &e) {
		if (e.isInjectedFault())
			self->setInjectedError();
		return SQLITE_IOERR_READ;
	}
}

#if 1
// Hands a buffer obtained from asyncReadZeroCopy back to the underlying file.
//
// The outstanding-reference debug counter is decremented before the release is attempted.
// Returns SQLITE_OK, or SQLITE_IOERR if the release threw (recording injected faults).
static int asyncReleaseZeroCopy(sqlite3_file* pFile, void* data, int iAmt, sqlite_int64 iOfst) {
	VFSAsyncFile *self = reinterpret_cast<VFSAsyncFile*>(pFile);
	int rc = SQLITE_OK;
	try {
		self->debug_zcrefs--;
		self->file->releaseZeroCopy( data, iAmt, iOfst );
	} catch (Error &e) {
		if (e.isInjectedFault())
			self->setInjectedError();
		rc = SQLITE_IOERR;
	}
	return rc;
}

// Zero-copy read of iAmt bytes at iOfst; on success *data points at the file's own buffer.
//
// pDataWasCached (optional) receives 1 if the read future was already ready when it was
// returned, else 0.
//
// Returns SQLITE_OK on a full read (the caller then owes an asyncReleaseZeroCopy),
// SQLITE_IOERR_SHORT_READ when fewer bytes were available (the buffer has already been
// released here), or SQLITE_IOERR_READ if anything threw.
static int asyncReadZeroCopy(sqlite3_file *pFile, void **data, int iAmt, sqlite_int64 iOfst, int *pDataWasCached) {
	VFSAsyncFile *self = reinterpret_cast<VFSAsyncFile*>(pFile);
	try {
		int got = iAmt;
		Future<Void> pending = self->file->readZeroCopy( data, &got, iOfst );

		if (pDataWasCached != NULL) {
			*pDataWasCached = pending.isReady() ? 1 : 0;
		}

		waitFor(pending);
		self->debug_zcrefs++;

		if (got >= iAmt) {
			self->debug_zcreads++;
			return SQLITE_OK;
		}

		// When reading past the EOF, sqlite expects the extra portion of the buffer to be zeroed.
		// We can't do that, so return and sqlite will use the slow path.
		asyncReleaseZeroCopy(pFile, *data, got, iOfst);
		return SQLITE_IOERR_SHORT_READ;
	} catch (Error &e) {
		if (e.isInjectedFault())
			self->setInjectedError();
		return SQLITE_IOERR_READ;
	}
}

#else
// Disabled fallback ("#else" branch): emulates a zero-copy read by allocating a private
// buffer and doing an ordinary read into it.  The matching release simply delete[]s it.
//
// Returns SQLITE_OK on a full read, SQLITE_IOERR_SHORT_READ with the tail zero-filled on a
// short read, and SQLITE_IOERR_READ if the read threw.  On the exception path the buffer is
// freed and *data is reset to NULL so nothing is leaked and a later release is harmless.
//
// NOTE(review): this signature lacks the pDataWasCached parameter that the enabled
// implementation takes; it would need updating before this branch could be re-enabled.
static int asyncReadZeroCopy(sqlite3_file *pFile, void **data, int iAmt, sqlite_int64 iOfst) {
	VFSAsyncFile *p = (VFSAsyncFile*)pFile;
	*data = NULL;
	try {
		*data = new char[iAmt];
		int readBytes = waitForAndGet( p->file->read( *data, iAmt, iOfst ) );
        //printf("+asyncReadRef %p +%lld %d/%d = %p\n", pFile, iOfst, readBytes, iAmt, *data);
		if (readBytes < iAmt) {
			memset((uint8_t*)*data + readBytes, 0, iAmt-readBytes);  // When reading past the EOF, sqlite expects the extra portion of the buffer to be zeroed
			return SQLITE_IOERR_SHORT_READ;
		}
		return SQLITE_OK;
	} catch (Error &e) {
		if(e.isInjectedFault()) {
			// Use the same accessor as every other handler in this file.
			p->setInjectedError();
		}
		// The read failed after the buffer was allocated: give it back.
		delete[] (char*)*data;
		*data = NULL;
		return SQLITE_IOERR_READ;
	}
}
// Disabled fallback ("#else" branch): frees the char[] buffer that the fallback
// asyncReadZeroCopy allocated.  pFile, iAmt and iOfst are unused.  Always returns SQLITE_OK.
static int asyncReleaseZeroCopy(sqlite3_file* pFile, void* data, int iAmt, sqlite_int64 iOfst) {
    //printf("-asyncReleaseRef %p +%lld %d <= %p\n", pFile, iOfst, iAmt, data);
	char *buffer = static_cast<char*>(data);
	delete[] buffer;
	return SQLITE_OK;
}
#endif

// xWrite: write iAmt bytes from zBuf at offset iOfst.
// Returns SQLITE_OK, or SQLITE_IOERR_WRITE if the write threw (recording injected faults).
static int asyncWrite(sqlite3_file *pFile, const void *zBuf, int iAmt, sqlite_int64 iOfst) {
	VFSAsyncFile *self = reinterpret_cast<VFSAsyncFile*>(pFile);
	try {
		waitFor( self->file->write( zBuf, iAmt, iOfst ) );
	} catch(Error &e) {
		if (e.isInjectedFault())
			self->setInjectedError();
		return SQLITE_IOERR_WRITE;
	}
	return SQLITE_OK;
}

// xTruncate: set the file length to `size`, first rounding `size` up to the next multiple
// of the file's chunkSize when a chunk size has been configured (non-zero).
// Returns SQLITE_OK, or SQLITE_IOERR_TRUNCATE if the truncate threw (recording injected faults).
static int asyncTruncate(sqlite3_file *pFile, sqlite_int64 size){
	VFSAsyncFile *self = reinterpret_cast<VFSAsyncFile*>(pFile);

	const sqlite_int64 chunk = self->chunkSize;
	if (chunk != 0) {
		const sqlite_int64 chunkCount = (size + chunk - 1) / chunk;
		size = chunkCount * chunk;
	}

	try {
		waitFor( self->file->truncate( size ) );
	} catch(Error &e) {
		if (e.isInjectedFault())
			self->setInjectedError();
		return SQLITE_IOERR_TRUNCATE;
	}
	return SQLITE_OK;
}

// xSync: sync the underlying file.  The `flags` argument is not consulted.
// Returns SQLITE_OK, or SQLITE_IOERR_FSYNC if the sync threw; a failure is also traced as
// "VFSSyncError" and injected faults are recorded on the file object.
static int asyncSync(sqlite3_file *pFile, int flags){
	VFSAsyncFile *self = reinterpret_cast<VFSAsyncFile*>(pFile);
	try {
		waitFor( self->file->sync() );
	} catch (Error &e) {
		if (e.isInjectedFault())
			self->setInjectedError();

		TraceEvent("VFSSyncError")
			.error(e)
			.detail("Filename", self->filename)
			.detail("Sqlite3File", (int64_t)pFile)
			.detail("IAsyncFile", (int64_t)self->file.getPtr());

		return SQLITE_IOERR_FSYNC;
	}
	return SQLITE_OK;
}

/*
** xFileSize: store the current size of the file, in bytes, in *pSize.
**
** Returns SQLITE_OK, or SQLITE_IOERR_FSTAT if the size query threw (in which
** case *pSize is left untouched and injected faults are recorded on the file).
*/
static int VFSAsyncFileSize(sqlite3_file *pFile, sqlite_int64 *pSize){
	VFSAsyncFile *self = reinterpret_cast<VFSAsyncFile*>(pFile);
	try {
		*pSize = waitForAndGet( self->file->size() );
	} catch (Error &e) {
		if (e.isInjectedFault())
			self->setInjectedError();
		return SQLITE_IOERR_FSTAT;
	}
	return SQLITE_OK;
}

// xLock: no lock state is tracked here.  A request for EXCLUSIVE_LOCK is always refused
// with SQLITE_BUSY; every other level is reported as granted.
static int asyncLock(sqlite3_file *pFile, int eLock){
	//VFSAsyncFile *p = (VFSAsyncFile*)pFile;

	//TraceEvent("FileLock").detail("File", p->filename).detail("Fd", p->file->debugFD()).detail("PrevLockLevel", p->lockLevel).detail("Op", eLock).detail("LockCount", *p->pLockCount);

	if (eLock == EXCLUSIVE_LOCK)
		return SQLITE_BUSY;
	return SQLITE_OK;
}
// xUnlock: nothing to release because asyncLock keeps no state.  Only checks (in debug
// builds, via assert) that the target level is NO_LOCK or SHARED_LOCK.
static int asyncUnlock(sqlite3_file *pFile, int eLock) {
	(void)pFile;
	assert( eLock <= SHARED_LOCK );
	return SQLITE_OK;
}
// xCheckReservedLock: sets *pResOut to 1 when the shared per-filename lock counter has
// reached RESERVED_COUNT, otherwise 0.  Always returns SQLITE_OK.
static int asyncCheckReservedLock(sqlite3_file *pFile, int *pResOut){
	VFSAsyncFile *self = reinterpret_cast<VFSAsyncFile*>(pFile);
	const uint32_t lockCount = *self->pLockCount;
	*pResOut = (lockCount >= RESERVED_COUNT) ? 1 : 0;
	return SQLITE_OK;
}

/*
** xFileControl: two verbs are handled by this VFS.
**
**   SQLITE_FCNTL_CHUNK_SIZE  pArg points to an int; it is remembered as the
**                            file's chunkSize (used by asyncTruncate).
**   SQLITE_FCNTL_SIZE_HINT   pArg points to a 64-bit size; the file is passed
**                            to asyncTruncate() with that size.
**
** Any other op returns SQLITE_NOTFOUND.
*/
static int VFSAsyncFileControl(sqlite3_file *pFile, int op, void *pArg){
	VFSAsyncFile *self = reinterpret_cast<VFSAsyncFile*>(pFile);

	if (op == SQLITE_FCNTL_CHUNK_SIZE) {
		self->chunkSize = *(int *)pArg;
		return SQLITE_OK;
	}

	if (op == SQLITE_FCNTL_SIZE_HINT) {
		return asyncTruncate(pFile, *(int64_t *)pArg);
	}

	return SQLITE_NOTFOUND;
}

// xSectorSize: fixed sector size reported for every file.
// SOMEDAY: Would 4K be better?
static int asyncSectorSize(sqlite3_file *pFile){
	(void)pFile;
	return 512;
}
// xDeviceCharacteristics: no device capability bits are reported.
static int asyncDeviceCharacteristics(sqlite3_file *pFile){
	(void)pFile;
	return 0;
}

#if 1
	// In-process stand-in for SQLite's WAL shared memory, one instance per database file.
	// Instances live as values in `table`, keyed by filename; `mutex` is the lock the
	// xShm* callbacks hold while touching them.
	struct SharedMemoryInfo {  // for a file
		std::string filename;                  // Key this entry is stored under in `table`
		std::vector<void*> regions;            // Region buffers, each allocated as uint8_t[regionSize]
		int regionSize;                        // Size in bytes of every region
		int refcount;   // Number of connections with this open
		int sharedLocks[SQLITE_SHM_NLOCK];     // Per-slot count of shared holders
		int exclusiveLocks[SQLITE_SHM_NLOCK];  // Per-slot count of exclusive holders

		SharedMemoryInfo() : regionSize(0), refcount(0) {
			memset(sharedLocks, 0, sizeof(sharedLocks));
			memset(exclusiveLocks, 0, sizeof(exclusiveLocks));
		}

		// Frees every region and removes this entry from `table`.
		// WARNING: when the entry is found, erasing it destroys *this; the object must not
		// be touched after cleanup() returns.
		void cleanup(){
			for (void* region : regions)
				delete[] (uint8_t*)region;
			regions.clear();

			// Copy the key first: `filename` is stored inside the very map node that
			// erase() is about to destroy, so it must not be passed by reference.
			const std::string key = filename;
			table.erase(key);
		}

		static Mutex mutex;
		static std::map< std::string, SharedMemoryInfo > table;
	};
	Mutex SharedMemoryInfo::mutex;
	std::map< std::string, SharedMemoryInfo > SharedMemoryInfo::table;

	/*
	** This function is called to obtain a pointer to region iRegion of the 
	** shared-memory associated with the database file fd. Shared-memory regions 
	** are numbered starting from zero. Each shared-memory region is szRegion 
	** bytes in size.
	**
	** If an error occurs, an error code is returned and *pp is set to NULL.
	**
	** Otherwise, if the bExtend parameter is 0 and the requested shared-memory
	** region has not been allocated (by any client, including one running in a
	** separate process), then *pp is set to NULL and SQLITE_OK returned. If 
	** bExtend is non-zero and the requested shared-memory region has not yet 
	** been allocated, it is allocated by this function.
	**
	** If the shared-memory region has already been allocated or is allocated by
	** this call as described above, then it is mapped into this process's 
	** address space (if it is not already), *pp is set to point to the mapped 
	** memory and SQLITE_OK returned.
	*/
	static int asyncShmMap(
	  sqlite3_file *fd,               /* Handle open on database file */
	  int iRegion,                    /* Region to retrieve */
	  int szRegion,                   /* Size of regions */
	  int bExtend,                    /* True to extend file if necessary */
	  void volatile **pp              /* OUT: Mapped memory */
	)
	{
		MutexHolder hold( SharedMemoryInfo::mutex );

		VFSAsyncFile *pDbFd = reinterpret_cast<VFSAsyncFile*>(fd);
		SharedMemoryInfo* memInfo = pDbFd->sharedMemory;

		if (memInfo) {
			// Already attached: every caller must agree on the region size.
			assert( memInfo->regionSize == szRegion );
		} else {
			// First map call on this connection: attach to (creating if necessary) the
			// table entry for this file and count this connection as a user of it.
			std::string filename = pDbFd->filename;
			memInfo = &SharedMemoryInfo::table[ filename ];
			pDbFd->sharedMemory = memInfo;
			memInfo->filename = filename;
			memInfo->regionSize = szRegion;
			++memInfo->refcount;
			//printf("Shared memory for: '%s' (%d refs)\n", filename.c_str(), memInfo->refcount);
		}

		std::vector<void*>& regions = memInfo->regions;
		if (iRegion >= regions.size()) {
			if (!bExtend) {
				// Region does not exist yet and we were asked not to create it.
				*pp = NULL;
				return SQLITE_OK;
			}
			// Allocate zero-filled regions up to and including iRegion.
			while (regions.size() <= iRegion) {
				void *mem = new uint8_t[ szRegion ];
				memset( mem, 0, szRegion );
				regions.push_back( mem );
			}
		}

		*pp = regions[ iRegion ];
		return SQLITE_OK;
	}

	/*
	** Change the lock state for a shared-memory segment.
	**
	** Note that the relationship between SHARED and EXCLUSIVE locks is a little
	** different here than in posix.  In xShmLock(), one can go from unlocked
	** to shared and back or from unlocked to exclusive and back.  But one may
	** not go from shared to exclusive or from exclusive to shared.
	*/
	// sqlite doesn't seem to match these up correctly - it happily calls unlock on locks it doesn't hold.
	// So we have to keep track of which locks are held by a given sqlite3_file
	// Bookkeeping used below:
	//   - memInfo->sharedLocks[i] / exclusiveLocks[i] count holders of slot i across all
	//     connections to the file.
	//   - pDbFd->sharedMemorySharedLocks / sharedMemoryExclusiveLocks are per-connection
	//     bitmasks (bit i set => this connection holds slot i in that mode).
	// Returns SQLITE_OK, or SQLITE_BUSY when an acquire conflicts with another holder.
	// NOTE(review): memInfo is dereferenced without a null check, so this relies on
	// asyncShmMap having been called on the connection first - confirm against callers.
	static int asyncShmLock(
	  sqlite3_file *fd,          /* Database file holding the shared memory */
	  int ofst,                  /* First lock to acquire or release */
	  int n,                     /* Number of locks to acquire or release */
	  int flags                  /* What to do with the lock */
	){
	  assert( ofst>=0 && ofst+n<=SQLITE_SHM_NLOCK );
	  assert( n>=1 );
	  assert( flags==(SQLITE_SHM_LOCK | SQLITE_SHM_SHARED)
		   || flags==(SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE)
		   || flags==(SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED)
		   || flags==(SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE) );
	  assert( n==1 || (flags & SQLITE_SHM_EXCLUSIVE)!=0 );
	
	  MutexHolder hold( SharedMemoryInfo::mutex );

	  VFSAsyncFile *pDbFd = (VFSAsyncFile*)fd;
	  SharedMemoryInfo* memInfo = pDbFd->sharedMemory;

	  if (flags & SQLITE_SHM_UNLOCK) {
		  // Release: drop whichever of the two modes this connection actually holds on each
		  // slot, regardless of the SHARED/EXCLUSIVE bit in flags.  Slots it does not hold
		  // are skipped, so spurious unlocks are harmless.
		  for(int i=ofst; i<ofst+n; i++) {
			  if ( pDbFd->sharedMemorySharedLocks & (1<<i) ) {
				  pDbFd->sharedMemorySharedLocks &= ~(1<<i);
				  --memInfo->sharedLocks[i];
			  }
			  if ( pDbFd->sharedMemoryExclusiveLocks & (1<<i) ) {
				  pDbFd->sharedMemoryExclusiveLocks &= ~(1<<i);
				  --memInfo->exclusiveLocks[i];
			  }
		  }
	  } else if (flags & SQLITE_SHM_SHARED) {
		  // Acquire shared, phase 1: fail without changing anything if any slot's exclusive
		  // count differs from this connection's own exclusive bit for that slot.
		  for(int i=ofst; i<ofst+n; i++)
			  if ( memInfo->exclusiveLocks[i] != ((pDbFd->sharedMemoryExclusiveLocks>>i)&1) ) {
				  //TraceEvent("ShmLocked").detail("File", DEBUG_DETERMINISM ? 0 : (int64_t)pDbFd).detail("Acquiring", "Shared").detail("I", i).detail("Exclusive", memInfo->exclusiveLocks[i]).detail("MyExclusive", pDbFd->sharedMemoryExclusiveLocks);
				  return SQLITE_BUSY;
			  }
		  // Phase 2: take each slot not already held shared by this connection.
		  for(int i=ofst; i<ofst+n; i++)
			  if ( !(pDbFd->sharedMemorySharedLocks & (1<<i)) ) {
				  pDbFd->sharedMemorySharedLocks |= 1<<i;
				  memInfo->sharedLocks[i]++;
			  }
	  } else {
		  // Acquire exclusive, phase 1: fail without changing anything if, on any slot, the
		  // exclusive or shared count differs from this connection's own bit for that mode.
		  for(int i=ofst; i<ofst+n; i++)
			  if ( memInfo->exclusiveLocks[i] != ((pDbFd->sharedMemoryExclusiveLocks>>i)&1) ||
				   memInfo->sharedLocks[i] != ((pDbFd->sharedMemorySharedLocks>>i)&1) )
			  {
				  //TraceEvent("ShmLocked").detail("File", DEBUG_DETERMINISM ? 0 : (int64_t)pDbFd).detail("Acquiring", "Exclusive").detail("I", i).detail("Exclusive", memInfo->exclusiveLocks[i]).detail("MyExclusive", pDbFd->sharedMemoryExclusiveLocks).detail("Shared", memInfo->sharedLocks[i]).detail("MyShared", pDbFd->sharedMemorySharedLocks);
				  return SQLITE_BUSY;
			  }
		  // Phase 2: take each slot not already held exclusively by this connection.
		  for(int i=ofst; i<ofst+n; i++)
			  if (!( pDbFd->sharedMemoryExclusiveLocks & (1<<i) )) {
				  pDbFd->sharedMemoryExclusiveLocks |= 1<<i;
				  memInfo->exclusiveLocks[i]++;
			  }
	  }
	  return SQLITE_OK;
	}

	/*
	** Implement a memory barrier or memory fence on shared memory.  
	**
	** All loads and stores begun before the barrier must complete before
	** any load or store begun after the barrier.
	*/
	// xShmBarrier: issues the platform barrier; the file handle is not used.
	// NOTE(review): the WIN32 branch uses _ReadWriteBarrier() while the other branch uses
	// __sync_synchronize(); confirm the two give the ordering guarantees required here.
	static void asyncShmBarrier(sqlite3_file* /*fd*/){
#if WIN32
		_ReadWriteBarrier();
#else
		__sync_synchronize();
#endif
	}

	/*
	** Close a connection to shared-memory.  Delete the underlying 
	** storage if deleteFlag is true.
	**
	** If there is no shared memory associated with the connection then this
	** routine is a harmless no-op.
	*/
	static int asyncShmUnmap(
	  sqlite3_file *fd,               /* The underlying database file */
	  int deleteFlag                  /* Delete shared-memory if true */
	){
		MutexHolder hold( SharedMemoryInfo::mutex );

		VFSAsyncFile *pDbFd = reinterpret_cast<VFSAsyncFile*>(fd);
		SharedMemoryInfo* memInfo = pDbFd->sharedMemory;

		// Nothing mapped on this connection: harmless no-op.
		if (memInfo == NULL)
			return SQLITE_OK;

		// Detach this connection and give up its reference.  The regions themselves are
		// not freed here; that happens from ~VFSAsyncFile via SharedMemoryInfo::cleanup().
		pDbFd->sharedMemory = 0;
		memInfo->refcount -= 1;

		//printf("Connection %p closed shared memory\n", fd);

		if (memInfo->refcount == 0) {
			//TraceEvent("CleanupSharedMemory").detail("Filename", memInfo->filename.c_str()).detail("RefCount", memInfo->refcount).detail("DeleteFlag", deleteFlag);
			//for(int i = 0; i < 8; i++)
				//TraceEvent("CleanupSharedMemory_Locks").detail("Filename", memInfo->filename.c_str()).detail("Num", i).detail("Shared", memInfo->sharedLocks[i]).detail("Exclusive", memInfo->exclusiveLocks[i]);

			//We don't think deleteFlag will ever be set
			ASSERT(!deleteFlag);
		}

		return SQLITE_OK;
	}

	// Drops this instance's count in filename_lockCount_openCount.  When it was the last
	// instance for the filename, the bookkeeping entry is erased and any shared memory
	// registered for the file is freed.
	VFSAsyncFile::~VFSAsyncFile() {
		//TraceEvent("VFSAsyncFileDel").detail("Filename", filename);
		int& openCount = filename_lockCount_openCount[filename].second;
		--openCount;
		if (openCount != 0)
			return;

		// Last instance for this filename (openCount is not used past this point).
		filename_lockCount_openCount.erase(filename);

		//Always delete the shared memory when the last copy of the file is deleted.  In simulation, this is helpful because "killing" a file without properly closing
		//it can result in a shared memory state that causes corruption when reopening the killed file.  The only expected penalty from doing this
		//is a potentially slower open operation on a database, but that should happen infrequently.
		//
		//We can't do this in ShmUnmap when refcount is 0 because it seems that SQLite sometimes subsequently tries to reopen the WAL from multiple locations simultaneously,
		//resulting in a locking error
		auto itr = SharedMemoryInfo::table.find(filename);
		if (itr == SharedMemoryInfo::table.end())
			return;

		ASSERT_ABORT(itr->second.refcount == 0);
		itr->second.cleanup();
	}

#endif

/*
** xOpen: construct a VFSAsyncFile in the sqlite3_file buffer supplied by SQLite
** and open the named file through IAsyncFileSystem.
**
** Only the READONLY/READWRITE bits of `flags` are forwarded to the file
** system (plus OPEN_LARGE_PAGES for WAL files and OPEN_LOCK always); the
** CREATE/EXCLUSIVE bits are deliberately dropped, see below.
**
** Returns SQLITE_OK with pFile->pMethods pointing at this VFS's method table;
** SQLITE_IOERR if zName is NULL (temp files are not supported);
** SQLITE_CANTOPEN if the underlying open threw.  On every failure path
** pFile->pMethods is NULL, as SQLite requires of xOpen, so that SQLite does
** not call xClose on a file that was never (or is no longer) constructed.
*/
static int asyncOpen(
  sqlite3_vfs *pVfs,              /* VFS */
  const char *zName,              /* File to open, or 0 for a temp file */
  sqlite3_file *pFile,            /* Pointer to VFSAsyncFile struct to populate */
  int flags,                      /* Input SQLITE_OPEN_XXX flags */
  int *pOutFlags                  /* Output SQLITE_OPEN_XXX flags (or NULL) */
){
	static const sqlite3_io_methods asyncio = {
		3,                            /* iVersion */
		asyncClose,                    /* xClose */
		asyncRead,                     /* xRead */
		asyncWrite,                    /* xWrite */
		asyncTruncate,                 /* xTruncate */
		asyncSync,                     /* xSync */
		VFSAsyncFileSize,                 /* xFileSize */
		asyncLock,                     /* xLock */
		asyncUnlock,                   /* xUnlock */
		asyncCheckReservedLock,        /* xCheckReservedLock */
		VFSAsyncFileControl,              /* xFileControl */
		asyncSectorSize,               /* xSectorSize */
		asyncDeviceCharacteristics,     /* xDeviceCharacteristics */
		asyncShmMap,
		asyncShmLock,
		asyncShmBarrier,
		asyncShmUnmap,
		asyncReadZeroCopy,
		asyncReleaseZeroCopy
	};

	VFSAsyncFile *p = (VFSAsyncFile*)pFile; /* Populate this structure */

	if( zName==0 ) {
		// xOpen must leave pMethods valid-or-NULL even when it fails; nothing has been
		// constructed in the buffer yet, so make sure SQLite will not try to close it.
		pFile->pMethods = 0;
		return SQLITE_IOERR;
	}

	static_assert( SQLITE_OPEN_EXCLUSIVE == IAsyncFile::OPEN_EXCLUSIVE && 
				   SQLITE_OPEN_CREATE == IAsyncFile::OPEN_CREATE &&
				   SQLITE_OPEN_READONLY == IAsyncFile::OPEN_READONLY &&
				   SQLITE_OPEN_READWRITE == IAsyncFile::OPEN_READWRITE, "SQLite flag values don't match IAsyncFile flag values" );

	// File creation here is disabled because we always create the files first in KeyValueStoreSQLite, using atomic creation
	int oflags = flags & (/*SQLITE_OPEN_EXCLUSIVE | SQLITE_OPEN_CREATE |*/ SQLITE_OPEN_READONLY | SQLITE_OPEN_READWRITE);
	if (flags & SQLITE_OPEN_WAL) oflags |= IAsyncFile::OPEN_LARGE_PAGES;
	oflags |= IAsyncFile::OPEN_LOCK;

	// Zero the whole buffer first: the constructor only initializes some members, and the
	// zeroed base.pMethods is what keeps the failure path below NULL.
	memset(p, 0, sizeof(VFSAsyncFile));
	new (p) VFSAsyncFile(zName, flags);
	try {
		// Note that SQLiteDB::open also opens the db file, so its flags and modes are important, too
		p->file = waitForAndGet( IAsyncFileSystem::filesystem()->open( p->filename, oflags, 0600 ) );

		/*TraceEvent("VFSOpened")
			.detail("Filename", p->filename)
			.detail("Fd", DEBUG_DETERMINISM ? 0 : p->file->debugFD())
			.detail("Flags", flags)
			.detail("Sqlite3File", DEBUG_DETERMINISM ? 0 : (int64_t)pFile)
			.detail("IAsyncFile", DEBUG_DETERMINISM ? 0 : (int64_t)p->file.getPtr());*/
	} catch (Error& e) {
		if(e.isInjectedFault()) {
			// The file object is about to be destroyed, so the fault is recorded statically.
			VFSAsyncFile::setOpenError();
		}
		TraceEvent("SQLiteOpenFail").error(e).detail("Filename", p->filename);
		p->~VFSAsyncFile();
		return SQLITE_CANTOPEN;
	}

	if( pOutFlags ){
		*pOutFlags = flags;
	}
	// Publishing the method table is the last step: from here on SQLite owns the close.
	p->base.pMethods = &asyncio;
	return SQLITE_OK;
}

// The next few functions, which perform filesystem operations by path rather than by file, have
// OS-specific implementations.

/*
** Delete the file identified by argument zPath. If the dirSync parameter
** is non-zero, then ensure the file-system modification to delete the
** file has been synced to disk before returning.
*/
static int asyncDelete(sqlite3_vfs *pVfs, const char *zPath, int dirSync){
	// At the moment this isn't used; hence isn't under test.
	// Could easily use IAsyncFileSystem::filesystem()->deleteFile().
	ASSERT( false );

	// Only reached if the assertion above does not stop execution.
	return SQLITE_IOERR_DELETE;
}

/*
** Query the file-system to see if the named file exists, is readable or
** is both readable and writable.  For an exists query, treat a zero-length file
** as if it does not exist.
*/
// xAccess: answers an existence / readability / read-write query for zPath by asking the
// operating system directly (this does not go through IAsyncFileSystem).
//   flags    one of SQLITE_ACCESS_EXISTS, SQLITE_ACCESS_READ, SQLITE_ACCESS_READWRITE
//   pResOut  receives 1 if the query succeeds, 0 otherwise
// Returns SQLITE_OK, except in the non-unix branch where a failure other than
// "file not found" yields SQLITE_IOERR_ACCESS.
static int asyncAccess(
  sqlite3_vfs *pVfs, 
  const char *zPath, 
  int flags, 
  int *pResOut
){
#ifdef __unixish__
	// Fallback definitions in case the platform headers did not provide the access() modes.
	#ifndef F_OK
	# define F_OK 0
	#endif
	#ifndef R_OK
	# define R_OK 4
	#endif
	#ifndef W_OK
	# define W_OK 2
	#endif
	int rc;                         /* access() return code */
	int eAccess = F_OK;             /* Second argument to access() */

	assert(flags==SQLITE_ACCESS_EXISTS       /* access(zPath, F_OK) */
		|| flags==SQLITE_ACCESS_READ         /* access(zPath, R_OK) */
		|| flags==SQLITE_ACCESS_READWRITE    /* access(zPath, R_OK|W_OK) */
	);

	if( flags==SQLITE_ACCESS_READWRITE ) eAccess = R_OK|W_OK;
	if( flags==SQLITE_ACCESS_READ )      eAccess = R_OK;

	rc = access(zPath, eAccess);
	*pResOut = (rc==0);

	// An existing but zero-length file is reported as not existing.  If stat() itself
	// fails, the result from access() is left as is.
	if( flags==SQLITE_ACCESS_EXISTS && *pResOut ){
		struct stat buf;
		if( 0==stat(zPath, &buf) && buf.st_size==0 ){
			*pResOut = 0;
		}
	}
	return SQLITE_OK;
#else
	WIN32_FILE_ATTRIBUTE_DATA data;
	// attr keeps this sentinel when the file is missing, or when an existence query finds
	// a zero-length file.
	DWORD attr = INVALID_FILE_ATTRIBUTES;
	memset(&data, 0, sizeof(data));
	if (GetFileAttributesEx(zPath, GetFileExInfoStandard, &data)) {
		if (!(flags == SQLITE_ACCESS_EXISTS && data.nFileSizeHigh==0 && data.nFileSizeLow==0))
			attr = data.dwFileAttributes;
	} else if (GetLastError()!=ERROR_FILE_NOT_FOUND)
		return SQLITE_IOERR_ACCESS;

	// Read-write access is judged solely by the absence of the read-only attribute bit;
	// the other two queries only ask whether attributes were obtained.
	if (flags == SQLITE_ACCESS_READWRITE)
		*pResOut = (attr & FILE_ATTRIBUTE_READONLY)==0;
	else
		*pResOut = attr != INVALID_FILE_ATTRIBUTES;
	return SQLITE_OK;
#endif
}

/*
** Argument zPath points to a nul-terminated string containing a file path.
** If zPath is an absolute path, then it is copied as is into the output 
** buffer. Otherwise, if it is a relative path, then the equivalent full
** path is written to the output buffer.
*/
// xFullPathname: writes abspath(zPath), including its terminating NUL, into zPathOut.
//
// Returns SQLITE_OK on success.  Returns SQLITE_IOERR if the result (plus NUL) does not fit
// in nPathOut bytes, if nPathOut is not positive, or if abspath() throws; a thrown error is
// also traced at SevError as "VFSAsyncFullPathnameError".
static int asyncFullPathname(
  sqlite3_vfs *pVfs,              /* VFS */
  const char *zPath,              /* Input path (possibly a relative path) */
  int nPathOut,                   /* Size of output buffer in bytes */
  char *zPathOut                  /* Pointer to output buffer */
){
	try {
		auto s = abspath( zPath );
		// Need room for s.size() characters plus the NUL.  The explicit sign check keeps a
		// negative nPathOut from being converted to a huge unsigned size.
		if (nPathOut <= 0 || s.size() >= (size_t)nPathOut)
			return SQLITE_IOERR;
		memcpy(zPathOut, s.c_str(), s.size()+1);
		return SQLITE_OK;
	} catch (Error& e) {
		if(e.isInjectedFault()) {
			// pVfs is the sqlite3_vfs, not a VFSAsyncFile, so there is no file object to
			// flag here; casting it and writing a member would corrupt memory.  Record the
			// fault through the static hook instead, as asyncOpen does.
			// NOTE(review): presumably appropriate because SQLite resolves the path while
			// opening a database - confirm against the consumer of setOpenError().
			VFSAsyncFile::setOpenError();
		}
		TraceEvent(SevError,"VFSAsyncFullPathnameError").error(e).detail("PathIn", (std::string)zPath);
		return SQLITE_IOERR;
	} catch(...) {
		TraceEvent(SevError,"VFSAsyncFullPathnameError").error(unknown_error()).detail("PathIn", (std::string)zPath);
		return SQLITE_IOERR;
	}
}

/*
** Returns true if there is a shared memory entry for the specified filename,
** and false otherwise.
*/
bool vfsAsyncIsOpen( std::string filename ) {
	// Entries in the shared memory table are looked up by absolute path.
	const std::string key = abspath(filename);
	return SharedMemoryInfo::table.find( key ) != SharedMemoryInfo::table.end();
}

/*
** The following four VFS methods:
**
**   xDlOpen
**   xDlError
**   xDlSym
**   xDlClose
**
** are supposed to implement the functionality needed by SQLite to load
** extensions compiled as shared objects. This simple VFS does not support
** this functionality, so the following functions are no-ops.
*/
// xDlOpen: extensions cannot be loaded; always yields a null handle.
static void *asyncDlOpen(sqlite3_vfs *pVfs, const char *zPath){
  (void)pVfs;
  (void)zPath;
  return nullptr;
}
// xDlError: writes a fixed "not supported" message into zErrMsg (at most nByte bytes,
// always NUL-terminated).  Does nothing when there is no usable buffer, so that the
// terminator is never written before the start of the buffer or through a null pointer.
static void asyncDlError(sqlite3_vfs *pVfs, int nByte, char *zErrMsg){
  if (nByte <= 0 || zErrMsg == 0)
    return;
  sqlite3_snprintf(nByte, zErrMsg, "Loadable extensions are not supported");
  zErrMsg[nByte-1] = '\0';
}
// xDlSym: no symbols can be resolved; always yields a null function pointer.
static void (*asyncDlSym(sqlite3_vfs *pVfs, void *pH, const char *z))(void){
  (void)pVfs;
  (void)pH;
  (void)z;
  return nullptr;
}
// xDlClose: nothing is ever opened by asyncDlOpen, so there is nothing to close.
static void asyncDlClose(sqlite3_vfs *pVfs, void *pHandle){
  (void)pVfs;
  (void)pHandle;
}

/*
** Parameter zByte points to a buffer nByte bytes in size. Populate this
** buffer with pseudo-random data.
*/
static int asyncRandomness(sqlite3_vfs *pVfs, int nByte, char *zByte){
  // One draw in [0,256) from deterministicRandom() per output byte, in buffer order.
  int filled = 0;
  while (filled < nByte) {
	  zByte[filled] = deterministicRandom()->randomInt(0,256);
	  ++filled;
  }
  return SQLITE_OK;
}

/*
** xSleep: wait for `microseconds` microseconds.
**
** Returns `microseconds` after the normal wait completes.  Returns 0 when,
** under simulation, the current process's shutdown signal is already set
** (after waiting FLOW_KNOBS->MAX_BUGGIFIED_DELAY instead), or when an error
** was thrown (which is also traced at SevError as "AsyncSleepError").
**
** Under simulation the normal wait also ends early if the shutdown signal
** fires; `microseconds` is still returned in that case.
*/
static int asyncSleep(sqlite3_vfs *pVfs, int microseconds){
	try {
		Future<Void> simCancel = Never();
		if( g_network->isSimulated() )
			simCancel = success( g_simulator.getCurrentProcess()->shutdownSignal.getFuture() );
		if( simCancel.isReady() ) {
			waitFor( delay(FLOW_KNOBS->MAX_BUGGIFIED_DELAY) );
			return 0;
		}
		waitFor( g_network->delay( microseconds*1e-6, TaskPriority::DefaultDelay ) || simCancel );
		return microseconds;
	} catch( Error &e ) {
		// pVfs is the sqlite3_vfs, not a VFSAsyncFile: there is no file object on which
		// to record an injected fault, and writing through such a cast would corrupt
		// memory.  The error is only traced.
		TraceEvent(SevError, "AsyncSleepError").error(e,true);
		return 0;
	}
}

/*
** Find the current time (in Universal Coordinated Time).  Write into *piNow
** the current time and date as a Julian Day number times 86_400_000.  In
** other words, write into *piNow the number of milliseconds since the Julian
** epoch of noon in Greenwich on November 24, 4714 B.C according to the
** proleptic Gregorian calendar.
**
** On success, return 0.  Return 1 if the time and date cannot be found.
*/
static int asyncCurrentTimeInt64(sqlite3_vfs *NotUsed, sqlite3_int64 *piNow){
#if __unixish__
	// Offset added to a millisecond count taken from gettimeofday().
	static const sqlite3_int64 unixEpoch = 24405875*(sqlite3_int64)8640000;
	struct timeval tv;
	gettimeofday(&tv, NULL);
	const sqlite3_int64 wholeSecondsMs = 1000*(sqlite3_int64)tv.tv_sec;
	const sqlite3_int64 fractionMs = tv.tv_usec/1000;
	*piNow = unixEpoch + wholeSecondsMs + fractionMs;
#elif defined(_WIN32)
	// Offset added to the system file time after it has been divided by 10000.
	static const sqlite3_int64 winFiletimeEpoch = 23058135*(sqlite3_int64)8640000;
	int64_t fileTime = 0;
	GetSystemTimeAsFileTime( (FILETIME*)&fileTime );
	*piNow = winFiletimeEpoch + fileTime / 10000;
#else
#error Port me!
#endif
	return 0;
}

/*
** Set *pTime to the current UTC time expressed as a Julian day. Return
** SQLITE_OK if successful, or an error code otherwise.
**
**   http://en.wikipedia.org/wiki/Julian_day
*/
static int asyncCurrentTime(sqlite3_vfs *pVfs, double *pTime){
	sqlite3_int64 julianMs = 0;
	const int rc = asyncCurrentTimeInt64(pVfs, &julianMs);
	if (rc == 0) {
		// 86400000 milliseconds per day: convert the millisecond count to fractional days.
		*pTime = julianMs / 86400000.0;
		return SQLITE_OK;
	}
	// Propagate the failure; *pTime is left untouched.
	return rc;
}

// xGetLastError: no error detail is tracked by this VFS; the buffer is left untouched and
// 0 is returned.
static int asyncGetLastError(sqlite3_vfs *NotUsed, int NotUsed2, char *NotUsed3){
	(void)NotUsed;
	(void)NotUsed2;
	(void)NotUsed3;
	return 0;
}

/*
** This function returns a pointer to the VFS implemented in this file.
** To make the VFS available to SQLite:
**
**   sqlite3_vfs_register(vfsAsync(), 0);
*/
sqlite3_vfs *vfsAsync(){
	// Single function-local instance; every call returns a pointer to the same object.
	static sqlite3_vfs asyncvfs = {
		3,                        /* iVersion */
		sizeof(VFSAsyncFile),     /* szOsFile */
		MAXPATHNAME,              /* mxPathname */
		nullptr,                  /* pNext */
		"fdb_async",              /* zName */
		nullptr,                  /* pAppData */
		asyncOpen,                /* xOpen */
		asyncDelete,              /* xDelete */
		asyncAccess,              /* xAccess */
		asyncFullPathname,        /* xFullPathname */
		asyncDlOpen,              /* xDlOpen */
		asyncDlError,             /* xDlError */
		asyncDlSym,               /* xDlSym */
		asyncDlClose,             /* xDlClose */
		asyncRandomness,          /* xRandomness */
		asyncSleep,               /* xSleep */
		asyncCurrentTime,         /* xCurrentTime */
		asyncGetLastError,        /* xGetLastError */
		asyncCurrentTimeInt64,    /* xCurrentTimeInt64 */
		nullptr,                  /* xSetSystemCall */
		nullptr,                  /* xGetSystemCall */
		nullptr,                  /* xNextSystemCall */
	};
	return &asyncvfs;
}