diff --git a/TODO.md b/TODO.md index ec7221aa..a40c2df3 100644 --- a/TODO.md +++ b/TODO.md @@ -18,6 +18,7 @@ - string_utils::tokenize should not have max length - only last token should be prefix searched - art int search should support signed ints +- art float search - storage key prefix should include collection name - Minimum results should be a variable instead of blindly going with max_results - Benchmark with -ffast-math @@ -26,6 +27,15 @@ - ~~Search across multiple fields~~ - Multi field search tests - Throw errors when schema is broken +- Index and search on multi-valued field +- Fix documents.jsonl path in tests +- Assumption that all tokens match for scoring is no longer true +- Primary_rank_scores and secondary_rank_scores hashmaps should be combined +- Proper logging +- Have set inside topster itself +- Restore records as well on restart (like for meta) +- Persist next_seq_id +- collection_id should be int, not string **API** diff --git a/include/collection.h b/include/collection.h index e25fccde..df72d090 100644 --- a/include/collection.h +++ b/include/collection.h @@ -11,16 +11,18 @@ class Collection { private: - Store* store; - std::string name; + std::string collection_id; + + // Auto incrementing record ID used internally for indexing - not exposed to the client + uint32_t next_seq_id; + spp::sparse_hash_map schema; std::vector rank_fields; - // Integer ID used internally for bitmaps - not exposed to the client - uint32_t seq_id; + Store* store; spp::sparse_hash_map index_map; @@ -28,16 +30,14 @@ private: spp::sparse_hash_map secondary_rank_scores; - uint32_t next_seq_id(); - - const std::string SEQ_ID_PREFIX = "SQ_"; - const std::string ID_PREFIX = "ID_"; - const std::string META_PREFIX = "MT_"; - - const std::string FIELDS_KEY = META_PREFIX + "_fields"; + // Using a $ prefix so that these keys stay at the top of a lexicographically ordered KV store + const std::string SEQ_ID_PREFIX = "$SI"; + const std::string DOC_ID_PREFIX = "$DI"; std::string 
get_seq_id_key(uint32_t seq_id); - std::string get_id_key(std::string id); + std::string get_doc_id_key(std::string doc_id); + + uint32_t get_next_seq_id(); static inline std::vector next_suggestion(const std::vector> &token_leaves, long long int n); @@ -57,10 +57,14 @@ private: public: Collection() = delete; - Collection(const std::string & state_dir_path, const std::string & name, const std::vector & search_fields, - const std::vector rank_fields); + + Collection(const std::string name, const std::string collection_id, const uint32_t next_seq_id, Store *store, + const std::vector & search_fields, const std::vector & rank_fields); + ~Collection(); + std::string add(std::string json_str); + std::vector search(std::string query, const std::vector fields, const int num_typos, const size_t num_results, const token_ordering token_order = FREQUENCY, const bool prefix = false); diff --git a/include/collection_manager.h b/include/collection_manager.h new file mode 100644 index 00000000..ba9915f1 --- /dev/null +++ b/include/collection_manager.h @@ -0,0 +1,51 @@ +#pragma once + +#include +#include +#include +#include "store.h" +#include "field.h" +#include "collection.h" + +// Singleton, for managing meta information of all collections and house keeping +class CollectionManager { +private: + Store *store; + + spp::sparse_hash_map collections; + + // Auto incrementing ID assigned to each collection + // Using a ID instead of a collection's name makes renaming possible + uint32_t next_collection_id; + + const std::string NEXT_COLLECTION_ID_KEY = "$CI"; + const std::string COLLECTION_NAME_PREFIX = "$CN"; + + const std::string COLLECTION_NAME_KEY = "name"; + const std::string COLLECTION_ID_KEY = "id"; + const std::string COLLECTION_NEXT_SEQ_ID_KEY = "next_seq_id"; + const std::string COLLECTION_SEARCH_FIELDS_KEY = "search_fields"; + const std::string COLLECTION_RANK_FIELDS_KEY = "rank_fields"; + + CollectionManager(); + + std::string get_collection_name_key(std::string name); + 
+public: + static CollectionManager& get_instance() { + static CollectionManager instance; + return instance; + } + + ~CollectionManager(); + + CollectionManager(CollectionManager const&) = delete; + void operator=(CollectionManager const&) = delete; + + void init(Store *store); + + Collection* create_collection(std::string name, const std::vector & search_fields, + const std::vector & rank_fields); + + Collection* get_collection(std::string collection_name); +}; \ No newline at end of file diff --git a/include/field.h b/include/field.h index 3aaa2394..c8c03e63 100644 --- a/include/field.h +++ b/include/field.h @@ -1,15 +1,22 @@ +#pragma once + #include -enum field_type { - INT32, - STRING -}; +namespace field_types { + static const std::string STRING = "STRING"; + static const std::string INT32 = "INT32"; +} + +namespace fields { + static const std::string name = "name"; + static const std::string type = "type"; +} struct field { std::string name; - field_type type; + std::string type; - field(std::string name, field_type type): name(name), type(type) { + field(std::string name, std::string type): name(name), type(type) { } }; \ No newline at end of file diff --git a/include/store.h b/include/store.h index 9f0273cb..62037d56 100644 --- a/include/store.h +++ b/include/store.h @@ -1,16 +1,15 @@ #pragma once #include +#include #include #include #include /* - * Stores all information about a collection. - * Uses RocksDB for persistence. + * Abstraction for underlying KV store (RocksDB) */ class Store { - private: std::string state_dir_path; @@ -23,7 +22,7 @@ public: Store() = delete; Store(std::string state_dir_path): state_dir_path(state_dir_path) { - // Optimize RocksDB. 
This is the easiest way to get RocksDB to perform well + // Optimize RocksDB options.IncreaseParallelism(); options.OptimizeLevelStyleCompaction(); // create the DB if it's not already present @@ -45,6 +44,12 @@ public: return status.ok(); } + bool contains(const std::string& key) { + std::string value; + rocksdb::Status status = db->Get(rocksdb::ReadOptions(), key, &value); + return status.ok() && !status.IsNotFound(); + } + bool get(const std::string& key, std::string& value) { rocksdb::Status status = db->Get(rocksdb::ReadOptions(), key, &value); return status.ok(); @@ -55,6 +60,15 @@ public: return status.ok(); } + void scan_fill(const std::string & prefix, std::vector & values) { + rocksdb::Iterator *iter = db->NewIterator(rocksdb::ReadOptions()); + for (iter->Seek(prefix); iter->Valid() && iter->key().starts_with(prefix); iter->Next()) { + values.push_back(iter->value().ToString()); + } + + delete iter; + } + void print_memory_usage() { std::string index_usage; db->GetProperty("rocksdb.estimate-table-readers-mem", &index_usage); diff --git a/src/collection.cpp b/src/collection.cpp index 969aaff0..a3d3f406 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -1,37 +1,24 @@ #include "collection.h" -#include #include #include -#include #include #include #include -#include -#include "art.h" -#include "json.hpp" -Collection::Collection(const std::string & state_dir_path, const std::string & name, const std::vector & search_fields, - const std::vector rank_fields): seq_id(0), name(name), rank_fields(rank_fields) { - store = new Store(state_dir_path); - - nlohmann::json fields_json = nlohmann::json::array(); +Collection::Collection(const std::string name, const std::string collection_id, const uint32_t next_seq_id, Store *store, + const std::vector &search_fields, const std::vector & rank_fields): + name(name), collection_id(collection_id), next_seq_id(next_seq_id), store(store), rank_fields(rank_fields) { for(const field& field: search_fields) { art_tree *t 
= new art_tree; art_tree_init(t); - - fields_json.push_back(field.name); index_map.emplace(field.name, t); schema.emplace(field.name, field); } - - store->insert(FIELDS_KEY, fields_json.dump()); } Collection::~Collection() { - delete store; - for(std::pair name_field: schema) { art_tree *t = index_map.at(name_field.first); art_tree_destroy(t); @@ -40,14 +27,14 @@ Collection::~Collection() { schema.clear(); } -uint32_t Collection::next_seq_id() { - return ++seq_id; +uint32_t Collection::get_next_seq_id() { + return ++next_seq_id; } std::string Collection::add(std::string json_str) { nlohmann::json document = nlohmann::json::parse(json_str); - uint32_t seq_id = next_seq_id(); + uint32_t seq_id = get_next_seq_id(); std::string seq_id_str = std::to_string(seq_id); if(document.count("id") == 0) { @@ -55,15 +42,15 @@ std::string Collection::add(std::string json_str) { } store->insert(get_seq_id_key(seq_id), document.dump()); - store->insert(get_id_key(document["id"]), seq_id_str); + store->insert(get_doc_id_key(document["id"]), seq_id_str); for(const std::pair & field_pair: schema) { const std::string & field_name = field_pair.first; art_tree *t = index_map.at(field_name); - if(field_pair.second.type == STRING) { + if(field_pair.second.type == field_types::STRING) { index_string_field(field_name, t, document, seq_id); - } else if(field_pair.second.type == INT32) { + } else if(field_pair.second.type == field_types::INT32) { index_int32_field(field_name, t, document, seq_id); } } @@ -201,6 +188,9 @@ void Collection::search_candidates(int & token_rank, std::vector Collection::search(std::string query, const std::vector fields, const int num_typos, const size_t num_results, const token_ordering token_order, const bool prefix) { + int size = index_map.size(); + std::cout << "search size: " << size << std::endl; + // Order of `fields` are used to rank results auto begin = std::chrono::high_resolution_clock::now(); std::vector::KV>> field_order_kvs; @@ -508,9 +498,9 @@ void 
_remove_and_shift_offset_index(forarray &offset_index, const uint32_t* indi void Collection::remove(std::string id) { std::string seq_id_str; - store->get(get_id_key(id), seq_id_str); + store->get(get_doc_id_key(id), seq_id_str); - uint32_t seq_id = (uint32_t) std::stoi(seq_id_str); + uint32_t seq_id = (uint32_t) std::stol(seq_id_str); std::string parsed_document; store->get(get_seq_id_key(seq_id), parsed_document); @@ -562,14 +552,14 @@ void Collection::remove(std::string id) { } } - store->remove(get_id_key(id)); + store->remove(get_doc_id_key(id)); store->remove(get_seq_id_key(seq_id)); } std::string Collection::get_seq_id_key(uint32_t seq_id) { - return SEQ_ID_PREFIX+std::to_string(seq_id); + return collection_id + "_" + SEQ_ID_PREFIX + std::to_string(seq_id); } -std::string Collection::get_id_key(std::string id) { - return ID_PREFIX+id; +std::string Collection::get_doc_id_key(std::string doc_id) { + return collection_id + "_" + DOC_ID_PREFIX + doc_id; } \ No newline at end of file diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp new file mode 100644 index 00000000..63218cc7 --- /dev/null +++ b/src/collection_manager.cpp @@ -0,0 +1,105 @@ + +#include +#include +#include +#include "collection_manager.h" + +CollectionManager::CollectionManager() { + +} + +/* Binds the manager to `store`, reads the next collection ID (seeding it with 0 when absent) and rebuilds one Collection per meta JSON found under COLLECTION_NAME_PREFIX. */ +void CollectionManager::init(Store *store) { + this->store = store; + + /* init() can run more than once on this singleton: free collections built against an earlier store so no stale Store pointer is served and emplace() below cannot leak a duplicate */ + for(auto & kv: collections) { + delete kv.second; + } + collections.clear(); + + std::string next_collection_id_str; + store->get(NEXT_COLLECTION_ID_KEY, next_collection_id_str); + if(!next_collection_id_str.empty()) { + next_collection_id = (uint32_t) std::stoul(next_collection_id_str); + } else { + next_collection_id = 0; + store->insert(NEXT_COLLECTION_ID_KEY, std::to_string(next_collection_id)); + } + + std::vector collection_meta_jsons; + store->scan_fill(COLLECTION_NAME_PREFIX, collection_meta_jsons); + + for(auto collection_meta_json: collection_meta_jsons) { + nlohmann::json collection_meta = nlohmann::json::parse(collection_meta_json); + + std::vector search_fields; + nlohmann::json fields_map = 
collection_meta[COLLECTION_SEARCH_FIELDS_KEY]; + + for (nlohmann::json::iterator it = fields_map.begin(); it != fields_map.end(); ++it) { + search_fields.push_back({it.value()[fields::name], it.value()[fields::type]}); + } + + Collection* collection = new Collection(collection_meta[COLLECTION_NAME_KEY].get(), + std::to_string(collection_meta[COLLECTION_ID_KEY].get()), + collection_meta[COLLECTION_NEXT_SEQ_ID_KEY].get(), + store, + search_fields, + collection_meta[COLLECTION_RANK_FIELDS_KEY].get>()); + collections.emplace(get_collection_name_key(collection_meta[COLLECTION_NAME_KEY]), collection); + } +} + +/* Creates a collection called `name`, persists its meta JSON and the incremented next collection ID, and registers it with the manager. Returns nullptr when the store already holds a collection of that name. */ +Collection* CollectionManager::create_collection(std::string name, const std::vector & search_fields, + const std::vector & rank_fields) { + if(store->contains(get_collection_name_key(name))) { + return nullptr; + } + + nlohmann::json collection_meta; + + nlohmann::json search_fields_json = nlohmann::json::array(); + for(const field& search_field: search_fields) { + nlohmann::json field_val; + field_val[fields::name] = search_field.name; + field_val[fields::type] = search_field.type; + search_fields_json.push_back(field_val); + } + + collection_meta[COLLECTION_NAME_KEY] = name; + collection_meta[COLLECTION_ID_KEY] = next_collection_id; + collection_meta[COLLECTION_NEXT_SEQ_ID_KEY] = 0; + collection_meta[COLLECTION_SEARCH_FIELDS_KEY] = search_fields_json; + collection_meta[COLLECTION_RANK_FIELDS_KEY] = rank_fields; + store->insert(get_collection_name_key(name), collection_meta.dump()); + + std::string collection_id_str = std::to_string(next_collection_id); + Collection* new_collection = new Collection(name, collection_id_str, 0, store, search_fields, rank_fields); + + next_collection_id++; + store->insert(NEXT_COLLECTION_ID_KEY, std::to_string(next_collection_id)); + + /* Track the new collection under the same key init() uses, so get_collection() can find it in this process and the destructor frees it */ + collections.emplace(get_collection_name_key(name), new_collection); + + return new_collection; +} + +std::string CollectionManager::get_collection_name_key(std::string collection_name) { + return COLLECTION_NAME_PREFIX + collection_name; +} + +Collection* 
CollectionManager::get_collection(std::string collection_name) { + if(collections.count(get_collection_name_key(collection_name)) != 0) { + return collections.at(get_collection_name_key(collection_name)); + } + + return nullptr; +} + +CollectionManager::~CollectionManager() { + for(auto kv: collections) { + delete kv.second; + } +} diff --git a/src/main/main.cpp b/src/main/main.cpp index e01f1a70..87b6122a 100644 --- a/src/main/main.cpp +++ b/src/main/main.cpp @@ -9,37 +9,21 @@ #include #include "string_utils.h" #include "collection.h" +#include "collection_manager.h" using namespace std; int main() { - std::array s = {5, 7, 4, 2, 8, 6, 1, 9, 0, 3}; - std::sort(s.begin(), s.end(), [](int a, int b) { - return a > b; - }); - for (auto a : s) { - std::cout << a << " "; - } - - std::cout << "\n\n\n"; - - auto cmp = [](int a, int b) { return a > b; }; - std::priority_queue, decltype(cmp)> q(cmp); - - for(int n : {1,8,5,6,3,4,0,9,7,2}) - q.push(n); - - while(!q.empty()) { - std::cout << q.top() << " "; - q.pop(); - } - std::cout << '\n'; - - return 0; - - std::vector fields = {field("title", field_type::STRING)}; + std::vector fields_to_index = {field("title", field_types::STRING)}; std::vector rank_fields = {"points"}; - Collection *collection = new Collection("/tmp/typesense-data", "collection", fields, rank_fields); + Store *store = new Store("/tmp/typesense-data"); + CollectionManager & collectionManager = CollectionManager::get_instance(); + collectionManager.init(store); + + Collection *collection = collectionManager.get_collection("collection"); + if(collection == nullptr) { + collection = collectionManager.create_collection("collection", fields_to_index, rank_fields); + } std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.jsonl"); //std::ifstream infile("/Users/kishore/Downloads/hnstories.jsonl"); @@ -60,6 +44,5 @@ int main() { collection->search("the", search_fields, 1, 100); long long int timeMillis = 
std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); cout << "Time taken: " << timeMillis << "us" << endl; - delete collection; return 0; } \ No newline at end of file diff --git a/src/main/server.cpp b/src/main/server.cpp index 1b177a23..6a370e70 100644 --- a/src/main/server.cpp +++ b/src/main/server.cpp @@ -16,6 +16,7 @@ #include #include "string_utils.h" #include "collection.h" +#include "collection_manager.h" #include #include "h2o.h" @@ -26,9 +27,12 @@ static h2o_globalconf_t config; static h2o_context_t ctx; static h2o_accept_ctx_t accept_ctx; -std::vector fields = {field("title", field_type::STRING)}; +std::vector search_fields = {field("title", field_types::STRING)}; std::vector rank_fields = {"points"}; -static Collection *collection = new Collection("/tmp/typesense-data", "collection", fields, rank_fields); +Store *store = new Store("/tmp/typesense-data"); + +CollectionManager & collectionManager = CollectionManager::get_instance(); +Collection *collection; static h2o_pathconf_t *register_handler(h2o_hostconf_t *hostconf, const char *path, int (*on_req)(h2o_handler_t *, h2o_req_t *)) { @@ -216,6 +220,12 @@ void index_documents() { int main(int argc, char **argv) { signal(SIGPIPE, SIG_IGN); + collectionManager.init(store); + collection = collectionManager.get_collection("collection"); + if(collection == nullptr) { + collection = collectionManager.create_collection("collection", search_fields, rank_fields); + } + index_documents(); h2o_config_init(&config); @@ -236,6 +246,5 @@ int main(int argc, char **argv) { while (h2o_evloop_run(ctx.loop) == 0); - delete collection; return 0; } \ No newline at end of file diff --git a/test/collection_test.cpp b/test/collection_test.cpp index a87e5ec9..ce2d1696 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -2,19 +2,33 @@ #include #include #include +#include #include "collection.h" class CollectionTest : public ::testing::Test { protected: Collection *collection; 
std::vector search_fields; + Store *store; + CollectionManager & collectionManager = CollectionManager::get_instance(); + + void setupCollection() { + std::string state_dir_path = "/tmp/typesense_test/collection"; + std::cout << "Truncating and creating: " << state_dir_path << std::endl; + system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str()); + + store = new Store(state_dir_path); + collectionManager.init(store); - virtual void SetUp() { std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.jsonl"); - std::vector fields = {field("title", field_type::STRING)}; + std::vector fields = {field("title", field_types::STRING)}; std::vector rank_fields = {"points"}; search_fields = {"title"}; - collection = new Collection("/tmp/typesense_test/collection", "collection", fields, rank_fields); + + collection = collectionManager.get_collection("collection"); + if(collection == nullptr) { + collection = collectionManager.create_collection("collection", fields, rank_fields); + } std::string json_line; @@ -25,8 +39,12 @@ protected: infile.close(); } + virtual void SetUp() { + setupCollection(); + } + virtual void TearDown() { - delete collection; + delete store; } }; @@ -272,4 +290,25 @@ TEST_F(CollectionTest, PrefixSearching) { std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } +} + +TEST_F(CollectionTest, MultipleFields) { + /*Collection *coll_mul_fields; + + std::ifstream infile("/Users/kishore/others/wreally/typesense/test/multi_field_documents.jsonl"); + std::vector fields = {field("title", field_types::STRING), field("starring", field_types::STRING)}; + std::vector rank_fields = {"points"}; + coll_mul_fields = new Collection("/tmp/typesense_test/coll_mul_fields", "coll_mul_fields", fields, rank_fields); + + std::string json_line; + + while (std::getline(infile, json_line)) { + coll_mul_fields->add(json_line); + } + + infile.close(); + + search_fields = {"title", "starring"}; + + delete coll_mul_fields;*/ } \ 
No newline at end of file diff --git a/test/multi_field_documents.jsonl b/test/multi_field_documents.jsonl new file mode 100644 index 00000000..ccade46e --- /dev/null +++ b/test/multi_field_documents.jsonl @@ -0,0 +1,18 @@ +{"title": "Wake Up, Ron Burgundy: The Lost Movie", "starring": "Will Ferrell", "points": 62 } +{"title": "Anchorman 2: The Legend Continues", "starring": "Will Ferrell", "points": 63 } +{"title": "There Will Be Blood", "starring": "Daniel Day-Lewis", "points": 81 } +{"title": "Good Will Hunting", "starring": "Robin Williams", "points": 83 } +{"title": "The Adventures of Huck Finn", "starring": "Ron Perlman", "points": 58 } +{"title": "Percy Jackson: Sea of Monsters", "starring": "Ron Perlman", "points": 59 } +{"title": "Captain America: The Winter Soldier", "starring": "Samuel L. Jackson", "points": 78 } +{"title": "Quantum Quest: A Cassini Space Odyssey", "starring": "Samuel L. Jackson", "points": 52 } +{"title": "Scott Pilgrim vs. the World", "starring": "Michael Cera", "points": 75 } +{"title": "Homeland Security", "starring": "Scott Glenn", "points": 43 } +{"title": "The Paperboy", "starring": "Scott Glenn", "points": 58 } +{"title": "The Silence of the Lambs", "starring": "Scott Glenn", "points": 86 } +{"title": "Confessions of a Shopaholic", "starring": "Kristin Scott Thomas", "points": 59 } +{"title": "The Woman in the Fifth", "starring": "Kristin Scott Thomas", "points": 53 } +{"title": "Odd Thomas", "starring": "Matthew Page", "points": 69 } +{"title": "Suffering Man's Charity", "starring": "Henry Thomas", "points": 48 } +{"title": "The Gospel According to St. Matthew", "starring": "Paola Tedesco", "points": 79 } +{"title": "Halloween 5: The Revenge of Michael Myers", "starring": "Donald Pleasence", "points": 52 } \ No newline at end of file