Initial sketch for persisting meta information about collections.

This commit is contained in:
Kishore Nallan 2017-01-08 19:47:17 +05:30
parent 2b6293650e
commit 2f08eca12e
11 changed files with 305 additions and 86 deletions

10
TODO.md
View File

@ -18,6 +18,7 @@
- string_utils::tokenize should not have max length
- only last token should be prefix searched
- art int search should support signed ints
- art float search
- storage key prefix should include collection name
- Minimum results should be a variable instead of blindly going with max_results
- Benchmark with -ffast-math
@ -26,6 +27,15 @@
- ~~Search across multiple fields~~
- Multi field search tests
- Throw errors when schema is broken
- Index and search on multi-valued field
- Fix documents.jsonl path in tests
- Assumption that all tokens match for scoring is no longer true
- Primary_rank_scores and secondary_rank_scores hashmaps should be combined
- Proper logging
- Have set inside topster itself
- Restore records as well on restart (like for meta)
- Persist next_seq_id
- collection_id should be int, not string
**API**

View File

@ -11,16 +11,18 @@
class Collection {
private:
Store* store;
std::string name;
std::string collection_id;
// Auto incrementing record ID used internally for indexing - not exposed to the client
uint32_t next_seq_id;
spp::sparse_hash_map<std::string, field> schema;
std::vector<std::string> rank_fields;
// Integer ID used internally for bitmaps - not exposed to the client
uint32_t seq_id;
Store* store;
spp::sparse_hash_map<std::string, art_tree*> index_map;
@ -28,16 +30,14 @@ private:
spp::sparse_hash_map<uint32_t, int64_t> secondary_rank_scores;
uint32_t next_seq_id();
const std::string SEQ_ID_PREFIX = "SQ_";
const std::string ID_PREFIX = "ID_";
const std::string META_PREFIX = "MT_";
const std::string FIELDS_KEY = META_PREFIX + "_fields";
// Using a $ prefix so that these keys stay at the top of a lexicographically ordered KV store
const std::string SEQ_ID_PREFIX = "$SI";
const std::string DOC_ID_PREFIX = "$DI";
std::string get_seq_id_key(uint32_t seq_id);
std::string get_id_key(std::string id);
std::string get_doc_id_key(std::string doc_id);
uint32_t get_next_seq_id();
static inline std::vector<art_leaf *> next_suggestion(const std::vector<std::vector<art_leaf *>> &token_leaves,
long long int n);
@ -57,10 +57,14 @@ private:
public:
Collection() = delete;
Collection(const std::string & state_dir_path, const std::string & name, const std::vector<field> & search_fields,
const std::vector<std::string> rank_fields);
Collection(const std::string name, const std::string collection_id, const uint32_t next_seq_id, Store *store,
const std::vector<field> & search_fields, const std::vector<std::string> & rank_fields);
~Collection();
std::string add(std::string json_str);
std::vector<nlohmann::json> search(std::string query, const std::vector<std::string> fields, const int num_typos,
const size_t num_results, const token_ordering token_order = FREQUENCY,
const bool prefix = false);

View File

@ -0,0 +1,51 @@
#pragma once
#include <iostream>
#include <string>
#include <sparsepp.h>
#include "store.h"
#include "field.h"
#include "collection.h"
// Singleton that manages the meta information of all collections and does the house keeping.
// Collections held in `collections` are deleted by the manager's destructor.
class CollectionManager {
private:
    // Backing KV store, supplied through init(). Explicitly null until then, instead of relying on
    // the zero-initialization that the function-local static instance happens to receive.
    Store *store = nullptr;

    // Collections known to the manager, keyed by get_collection_name_key(<collection name>)
    spp::sparse_hash_map<std::string, Collection*> collections;

    // Auto incrementing ID assigned to each collection
    // Using an ID instead of a collection's name makes renaming possible
    uint32_t next_collection_id = 0;

    // Store key under which the next collection ID is persisted
    const std::string NEXT_COLLECTION_ID_KEY = "$CI";

    // Prefix of the store keys under which each collection's meta JSON is persisted
    const std::string COLLECTION_NAME_PREFIX = "$CN";

    // Attribute names used inside a collection's meta JSON
    const std::string COLLECTION_NAME_KEY = "name";
    const std::string COLLECTION_ID_KEY = "id";
    const std::string COLLECTION_NEXT_SEQ_ID_KEY = "next_seq_id";
    const std::string COLLECTION_SEARCH_FIELDS_KEY = "search_fields";
    const std::string COLLECTION_RANK_FIELDS_KEY = "rank_fields";

    // Private: instances are obtained only through get_instance()
    CollectionManager();

    // Returns COLLECTION_NAME_PREFIX followed by `name`
    std::string get_collection_name_key(std::string name);

public:
    // Returns the single, lazily constructed manager instance
    static CollectionManager& get_instance() {
        static CollectionManager instance;
        return instance;
    }

    ~CollectionManager();

    // A singleton must be neither copied nor assigned
    CollectionManager(CollectionManager const&) = delete;
    void operator=(CollectionManager const&) = delete;

    // Attaches the manager to `store` and loads the persisted next collection ID as well as every
    // collection whose meta information is stored under COLLECTION_NAME_PREFIX.
    // Must be called before create_collection().
    void init(Store *store);

    // Persists the meta information of a new collection and returns the collection.
    // Returns nullptr when the store already holds a collection with this name.
    Collection* create_collection(std::string name, const std::vector<field> & search_fields,
                                  const std::vector<std::string> & rank_fields);

    // Returns the collection registered under `collection_name`, or nullptr when there is none
    Collection* get_collection(std::string collection_name);
};

View File

@ -1,15 +1,22 @@
#pragma once
#include <string>
enum field_type {
INT32,
STRING
};
// String identifiers of the supported field types
namespace field_types {
    static const std::string STRING("STRING");
    static const std::string INT32("INT32");
}
// Attribute names used when a field definition is written to / read from JSON
namespace fields {
    static const std::string name("name");
    static const std::string type("type");
}
struct field {
std::string name;
field_type type;
std::string type;
field(std::string name, field_type type): name(name), type(type) {
field(std::string name, std::string type): name(name), type(type) {
}
};

View File

@ -1,16 +1,15 @@
#pragma once
#include <stdint.h>
#include <cstdlib>
#include <string>
#include <rocksdb/db.h>
#include <rocksdb/options.h>
/*
* Stores all information about a collection.
* Uses RocksDB for persistence.
* Abstraction for underlying KV store (RocksDB)
*/
class Store {
private:
std::string state_dir_path;
@ -23,7 +22,7 @@ public:
Store() = delete;
Store(std::string state_dir_path): state_dir_path(state_dir_path) {
// Optimize RocksDB. This is the easiest way to get RocksDB to perform well
// Optimize RocksDB
options.IncreaseParallelism();
options.OptimizeLevelStyleCompaction();
// create the DB if it's not already present
@ -45,6 +44,12 @@ public:
return status.ok();
}
bool contains(const std::string& key) {
std::string value;
rocksdb::Status status = db->Get(rocksdb::ReadOptions(), key, &value);
return status.ok() && !status.IsNotFound();
}
bool get(const std::string& key, std::string& value) {
rocksdb::Status status = db->Get(rocksdb::ReadOptions(), key, &value);
return status.ok();
@ -55,6 +60,15 @@ public:
return status.ok();
}
void scan_fill(const std::string & prefix, std::vector<std::string> & values) {
rocksdb::Iterator *iter = db->NewIterator(rocksdb::ReadOptions());
for (iter->Seek(prefix); iter->Valid() && iter->key().starts_with(prefix); iter->Next()) {
values.push_back(iter->value().ToString());
}
delete iter;
}
void print_memory_usage() {
std::string index_usage;
db->GetProperty("rocksdb.estimate-table-readers-mem", &index_usage);

View File

@ -1,37 +1,24 @@
#include "collection.h"
#include <iostream>
#include <numeric>
#include <chrono>
#include <topster.h>
#include <intersection.h>
#include <match_score.h>
#include <string_utils.h>
#include <art.h>
#include "art.h"
#include "json.hpp"
Collection::Collection(const std::string & state_dir_path, const std::string & name, const std::vector<field> & search_fields,
const std::vector<std::string> rank_fields): seq_id(0), name(name), rank_fields(rank_fields) {
store = new Store(state_dir_path);
nlohmann::json fields_json = nlohmann::json::array();
Collection::Collection(const std::string name, const std::string collection_id, const uint32_t next_seq_id, Store *store,
const std::vector<field> &search_fields, const std::vector<std::string> & rank_fields):
name(name), collection_id(collection_id), next_seq_id(next_seq_id), store(store), rank_fields(rank_fields) {
for(const field& field: search_fields) {
art_tree *t = new art_tree;
art_tree_init(t);
fields_json.push_back(field.name);
index_map.emplace(field.name, t);
schema.emplace(field.name, field);
}
store->insert(FIELDS_KEY, fields_json.dump());
}
Collection::~Collection() {
delete store;
for(std::pair<std::string, field> name_field: schema) {
art_tree *t = index_map.at(name_field.first);
art_tree_destroy(t);
@ -40,14 +27,14 @@ Collection::~Collection() {
schema.clear();
}
uint32_t Collection::next_seq_id() {
return ++seq_id;
uint32_t Collection::get_next_seq_id() {
return ++next_seq_id;
}
std::string Collection::add(std::string json_str) {
nlohmann::json document = nlohmann::json::parse(json_str);
uint32_t seq_id = next_seq_id();
uint32_t seq_id = get_next_seq_id();
std::string seq_id_str = std::to_string(seq_id);
if(document.count("id") == 0) {
@ -55,15 +42,15 @@ std::string Collection::add(std::string json_str) {
}
store->insert(get_seq_id_key(seq_id), document.dump());
store->insert(get_id_key(document["id"]), seq_id_str);
store->insert(get_doc_id_key(document["id"]), seq_id_str);
for(const std::pair<std::string, field> & field_pair: schema) {
const std::string & field_name = field_pair.first;
art_tree *t = index_map.at(field_name);
if(field_pair.second.type == STRING) {
if(field_pair.second.type == field_types::STRING) {
index_string_field(field_name, t, document, seq_id);
} else if(field_pair.second.type == INT32) {
} else if(field_pair.second.type == field_types::INT32) {
index_int32_field(field_name, t, document, seq_id);
}
}
@ -201,6 +188,9 @@ void Collection::search_candidates(int & token_rank, std::vector<std::vector<art
std::vector<nlohmann::json> Collection::search(std::string query, const std::vector<std::string> fields,
const int num_typos, const size_t num_results,
const token_ordering token_order, const bool prefix) {
int size = index_map.size();
std::cout << "search size: " << size << std::endl;
// Order of `fields` are used to rank results
auto begin = std::chrono::high_resolution_clock::now();
std::vector<std::pair<int, Topster<100>::KV>> field_order_kvs;
@ -508,9 +498,9 @@ void _remove_and_shift_offset_index(forarray &offset_index, const uint32_t* indi
void Collection::remove(std::string id) {
std::string seq_id_str;
store->get(get_id_key(id), seq_id_str);
store->get(get_doc_id_key(id), seq_id_str);
uint32_t seq_id = (uint32_t) std::stoi(seq_id_str);
uint32_t seq_id = (uint32_t) std::stol(seq_id_str);
std::string parsed_document;
store->get(get_seq_id_key(seq_id), parsed_document);
@ -562,14 +552,14 @@ void Collection::remove(std::string id) {
}
}
store->remove(get_id_key(id));
store->remove(get_doc_id_key(id));
store->remove(get_seq_id_key(seq_id));
}
std::string Collection::get_seq_id_key(uint32_t seq_id) {
return SEQ_ID_PREFIX+std::to_string(seq_id);
return collection_id + "_" + SEQ_ID_PREFIX + std::to_string(seq_id);
}
std::string Collection::get_id_key(std::string id) {
return ID_PREFIX+id;
std::string Collection::get_doc_id_key(std::string doc_id) {
return collection_id + "_" + DOC_ID_PREFIX + doc_id;
}

View File

@ -0,0 +1,94 @@
#include <string>
#include <vector>
#include <json.hpp>
#include "collection_manager.h"
// Nothing is set up at construction time; the store is handed over later through init()
CollectionManager::CollectionManager() = default;
// Attaches the manager to `store` and rebuilds the in-memory state from what is persisted there:
// the next collection ID, plus one Collection per meta entry stored under COLLECTION_NAME_PREFIX.
// Collections held from an earlier init() are destroyed first, so Collection pointers handed out
// before a re-init must not be used afterwards.
void CollectionManager::init(Store *store) {
    this->store = store;

    // Discard collections belonging to a previously attached store: they would otherwise leak,
    // and emplace() below does not replace an entry that already exists under the same key.
    for(const auto & name_collection: collections) {
        delete name_collection.second;
    }
    collections.clear();

    std::string next_collection_id_str;
    store->get(NEXT_COLLECTION_ID_KEY, next_collection_id_str);

    if(!next_collection_id_str.empty()) {
        // stoul rather than stoi: the ID is a uint32_t and stoi throws for values above INT_MAX
        next_collection_id = static_cast<uint32_t>(std::stoul(next_collection_id_str));
    } else {
        // Fresh store: start counting from zero and persist the counter right away
        next_collection_id = 0;
        store->insert(NEXT_COLLECTION_ID_KEY, std::to_string(next_collection_id));
    }

    std::vector<std::string> collection_meta_jsons;
    store->scan_fill(COLLECTION_NAME_PREFIX, collection_meta_jsons);

    for(const std::string & collection_meta_json: collection_meta_jsons) {
        nlohmann::json collection_meta = nlohmann::json::parse(collection_meta_json);

        // Rebuild the search field definitions from their {name, type} JSON objects
        std::vector<field> search_fields;
        nlohmann::json & search_fields_json = collection_meta[COLLECTION_SEARCH_FIELDS_KEY];
        for(nlohmann::json & search_field_json: search_fields_json) {
            search_fields.push_back(field(search_field_json[fields::name].get<std::string>(),
                                          search_field_json[fields::type].get<std::string>()));
        }

        const std::string collection_name = collection_meta[COLLECTION_NAME_KEY].get<std::string>();

        Collection* collection = new Collection(collection_name,
                                                std::to_string(collection_meta[COLLECTION_ID_KEY].get<uint32_t>()),
                                                collection_meta[COLLECTION_NEXT_SEQ_ID_KEY].get<uint32_t>(),
                                                store,
                                                search_fields,
                                                collection_meta[COLLECTION_RANK_FIELDS_KEY].get<std::vector<std::string>>());

        collections.emplace(get_collection_name_key(collection_name), collection);
    }
}

// Creates a collection called `name`, persists its meta information and the advanced collection ID
// counter, and registers the collection with the manager (which deletes it in its destructor).
// Returns nullptr when the store already holds a collection with this name.
// init() must have been called before.
Collection* CollectionManager::create_collection(std::string name, const std::vector<field> & search_fields,
                                                 const std::vector<std::string> & rank_fields) {
    const std::string collection_name_key = get_collection_name_key(name);

    if(store->contains(collection_name_key)) {
        return nullptr;
    }

    nlohmann::json search_fields_json = nlohmann::json::array();

    for(const field& search_field: search_fields) {
        nlohmann::json field_val;
        field_val[fields::name] = search_field.name;
        field_val[fields::type] = search_field.type;
        search_fields_json.push_back(field_val);
    }

    nlohmann::json collection_meta;
    collection_meta[COLLECTION_NAME_KEY] = name;
    collection_meta[COLLECTION_ID_KEY] = next_collection_id;
    collection_meta[COLLECTION_NEXT_SEQ_ID_KEY] = 0;
    collection_meta[COLLECTION_SEARCH_FIELDS_KEY] = search_fields_json;
    collection_meta[COLLECTION_RANK_FIELDS_KEY] = rank_fields;

    store->insert(collection_name_key, collection_meta.dump());

    const std::string collection_id_str = std::to_string(next_collection_id);
    Collection* new_collection = new Collection(name, collection_id_str, 0, store, search_fields, rank_fields);

    next_collection_id++;
    store->insert(NEXT_COLLECTION_ID_KEY, std::to_string(next_collection_id));

    // Without this registration get_collection() cannot find the collection and nobody frees it
    collections.emplace(collection_name_key, new_collection);

    return new_collection;
}
// Builds the key under which the meta information of `collection_name` is kept:
// COLLECTION_NAME_PREFIX immediately followed by the name.
std::string CollectionManager::get_collection_name_key(std::string collection_name) {
    std::string key(COLLECTION_NAME_PREFIX);
    key += collection_name;
    return key;
}
// Looks up a collection by name; yields nullptr when the manager holds no collection of that name.
Collection* CollectionManager::get_collection(std::string collection_name) {
    const std::string lookup_key = get_collection_name_key(collection_name);
    const auto found = collections.find(lookup_key);

    return (found == collections.end()) ? nullptr : found->second;
}
// Frees every Collection that is still registered with the manager
CollectionManager::~CollectionManager() {
    for(auto entry = collections.begin(); entry != collections.end(); ++entry) {
        delete entry->second;
    }
}

View File

@ -9,37 +9,21 @@
#include <queue>
#include "string_utils.h"
#include "collection.h"
#include "collection_manager.h"
using namespace std;
int main() {
std::array<int, 10> s = {5, 7, 4, 2, 8, 6, 1, 9, 0, 3};
std::sort(s.begin(), s.end(), [](int a, int b) {
return a > b;
});
for (auto a : s) {
std::cout << a << " ";
}
std::cout << "\n\n\n";
auto cmp = [](int a, int b) { return a > b; };
std::priority_queue<int, std::vector<int>, decltype(cmp)> q(cmp);
for(int n : {1,8,5,6,3,4,0,9,7,2})
q.push(n);
while(!q.empty()) {
std::cout << q.top() << " ";
q.pop();
}
std::cout << '\n';
return 0;
std::vector<field> fields = {field("title", field_type::STRING)};
std::vector<field> fields_to_index = {field("title", field_types::STRING)};
std::vector<std::string> rank_fields = {"points"};
Collection *collection = new Collection("/tmp/typesense-data", "collection", fields, rank_fields);
Store *store = new Store("/tmp/typesense-data");
CollectionManager & collectionManager = CollectionManager::get_instance();
collectionManager.init(store);
Collection *collection = collectionManager.get_collection("collection");
if(collection == nullptr) {
collection = collectionManager.create_collection("collection", fields_to_index, rank_fields);
}
std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.jsonl");
//std::ifstream infile("/Users/kishore/Downloads/hnstories.jsonl");
@ -60,6 +44,5 @@ int main() {
collection->search("the", search_fields, 1, 100);
long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
cout << "Time taken: " << timeMillis << "us" << endl;
delete collection;
return 0;
}

View File

@ -16,6 +16,7 @@
#include <regex>
#include "string_utils.h"
#include "collection.h"
#include "collection_manager.h"
#include <sys/resource.h>
#include "h2o.h"
@ -26,9 +27,12 @@
static h2o_globalconf_t config;
static h2o_context_t ctx;
static h2o_accept_ctx_t accept_ctx;
std::vector<field> fields = {field("title", field_type::STRING)};
std::vector<field> search_fields = {field("title", field_types::STRING)};
std::vector<std::string> rank_fields = {"points"};
static Collection *collection = new Collection("/tmp/typesense-data", "collection", fields, rank_fields);
Store *store = new Store("/tmp/typesense-data");
CollectionManager & collectionManager = CollectionManager::get_instance();
Collection *collection;
static h2o_pathconf_t *register_handler(h2o_hostconf_t *hostconf, const char *path,
int (*on_req)(h2o_handler_t *, h2o_req_t *)) {
@ -216,6 +220,12 @@ void index_documents() {
int main(int argc, char **argv) {
signal(SIGPIPE, SIG_IGN);
collectionManager.init(store);
collection = collectionManager.get_collection("collection");
if(collection == nullptr) {
collection = collectionManager.create_collection("collection", search_fields, rank_fields);
}
index_documents();
h2o_config_init(&config);
@ -236,6 +246,5 @@ int main(int argc, char **argv) {
while (h2o_evloop_run(ctx.loop) == 0);
delete collection;
return 0;
}

View File

@ -2,19 +2,33 @@
#include <string>
#include <vector>
#include <fstream>
#include <collection_manager.h>
#include "collection.h"
class CollectionTest : public ::testing::Test {
protected:
Collection *collection;
std::vector<std::string> search_fields;
Store *store;
CollectionManager & collectionManager = CollectionManager::get_instance();
void setupCollection() {
std::string state_dir_path = "/tmp/typesense_test/collection";
std::cout << "Truncating and creating: " << state_dir_path << std::endl;
system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str());
store = new Store(state_dir_path);
collectionManager.init(store);
virtual void SetUp() {
std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.jsonl");
std::vector<field> fields = {field("title", field_type::STRING)};
std::vector<field> fields = {field("title", field_types::STRING)};
std::vector<std::string> rank_fields = {"points"};
search_fields = {"title"};
collection = new Collection("/tmp/typesense_test/collection", "collection", fields, rank_fields);
collection = collectionManager.get_collection("collection");
if(collection == nullptr) {
collection = collectionManager.create_collection("collection", fields, rank_fields);
}
std::string json_line;
@ -25,8 +39,12 @@ protected:
infile.close();
}
virtual void SetUp() {
setupCollection();
}
virtual void TearDown() {
delete collection;
delete store;
}
};
@ -272,4 +290,25 @@ TEST_F(CollectionTest, PrefixSearching) {
std::string id = ids.at(i);
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
}
// Placeholder for multi-field search coverage: the entire body is commented out, so this test
// currently asserts nothing and always passes.
// NOTE(review): the disabled code still constructs the Collection from a state directory path,
// i.e. the constructor form that predates the Store-based one - presumably it has to be ported to
// CollectionManager::create_collection before it can be re-enabled; confirm.
TEST_F(CollectionTest, MultipleFields) {
/*Collection *coll_mul_fields;
std::ifstream infile("/Users/kishore/others/wreally/typesense/test/multi_field_documents.jsonl");
std::vector<field> fields = {field("title", field_types::STRING), field("starring", field_types::STRING)};
std::vector<std::string> rank_fields = {"points"};
coll_mul_fields = new Collection("/tmp/typesense_test/coll_mul_fields", "coll_mul_fields", fields, rank_fields);
std::string json_line;
while (std::getline(infile, json_line)) {
coll_mul_fields->add(json_line);
}
infile.close();
search_fields = {"title", "starring"};
delete coll_mul_fields;*/
}

View File

@ -0,0 +1,18 @@
{"title": "Wake Up, Ron Burgundy: The Lost Movie", "starring": "Will Ferrell", "points": 62 }
{"title": "Anchorman 2: The Legend Continues", "starring": "Will Ferrell", "points": 63 }
{"title": "There Will Be Blood", "starring": "Daniel Day-Lewis", "points": 81 }
{"title": "Good Will Hunting", "starring": "Robin Williams", "points": 83 }
{"title": "The Adventures of Huck Finn", "starring": "Ron Perlman", "points": 58 }
{"title": "Percy Jackson: Sea of Monsters", "starring": "Ron Perlman", "points": 59 }
{"title": "Captain America: The Winter Soldier", "starring": "Samuel L. Jackson", "points": 78 }
{"title": "Quantum Quest: A Cassini Space Odyssey", "starring": "Samuel L. Jackson", "points": 52 }
{"title": "Scott Pilgrim vs. the World", "starring": "Michael Cera", "points": 75 }
{"title": "Homeland Security", "starring": "Scott Glenn", "points": 43 }
{"title": "The Paperboy", "starring": "Scott Glenn", "points": 58 }
{"title": "The Silence of the Lambs", "starring": "Scott Glenn", "points": 86 }
{"title": "Confessions of a Shopaholic", "starring": "Kristin Scott Thomas", "points": 59 }
{"title": "The Woman in the Fifth", "starring": "Kristin Scott Thomas", "points": 53 }
{"title": "Odd Thomas", "starring": "Matthew Page", "points": 69 }
{"title": "Suffering Man's Charity", "starring": "Henry Thomas", "points": 48 }
{"title": "The Gospel According to St. Matthew", "starring": "Paola Tedesco", "points": 79 }
{"title": "Halloween 5: The Revenge of Michael Myers", "starring": "Donald Pleasence", "points": 52 }