diff --git a/TODO.md b/TODO.md index c5483538..61c8de9c 100644 --- a/TODO.md +++ b/TODO.md @@ -28,7 +28,7 @@ - ~~Filters~~ - ~~Facets~~ - ~~Schema validation during insertion (missing fields + type errors)~~ -- Proper score field for ranking tokens +- ~~Proper score field for ranking tokens~~ - Prevent string copy during indexing - clean special chars before indexing - Minimum results should be a variable instead of blindly going with max_results diff --git a/include/collection.h b/include/collection.h index f12fd2f2..d28b2d7a 100644 --- a/include/collection.h +++ b/include/collection.h @@ -33,6 +33,8 @@ private: spp::sparse_hash_map*> rank_index; + std::string token_ordering_field; + std::string get_doc_id_key(std::string doc_id); std::string get_seq_id_key(uint32_t seq_id); @@ -80,7 +82,7 @@ public: Collection(const std::string name, const uint32_t collection_id, const uint32_t next_seq_id, Store *store, const std::vector & search_fields, const std::vector & facet_fields, - const std::vector & rank_fields); + const std::vector & rank_fields, const std::string token_ordering_field); ~Collection(); @@ -102,6 +104,8 @@ public: spp::sparse_hash_map get_schema(); + std::string get_token_ordering_field(); + Option add(std::string json_str); nlohmann::json search(std::string query, const std::vector search_fields, diff --git a/include/collection_manager.h b/include/collection_manager.h index 2191ca8e..a71dc611 100644 --- a/include/collection_manager.h +++ b/include/collection_manager.h @@ -23,6 +23,7 @@ private: static constexpr const char* COLLECTION_SEARCH_FIELDS_KEY = "search_fields"; static constexpr const char* COLLECTION_FACET_FIELDS_KEY = "facet_fields"; static constexpr const char* COLLECTION_RANK_FIELDS_KEY = "rank_fields"; + static constexpr const char* COLLECTION_TOKEN_ORDERING_FIELD_KEY = "token_ordering_field"; CollectionManager(); @@ -41,7 +42,8 @@ public: Collection* create_collection(std::string name, const std::vector & search_fields, const std::vector & facet_fields, - const std::vector & rank_fields); + const std::vector & rank_fields, + const std::string & token_ordering_field = ""); Collection* get_collection(std::string collection_name); diff --git a/src/collection.cpp b/src/collection.cpp index 023b8c8f..e077a946 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -9,8 +9,9 @@ Collection::Collection(const std::string name, const uint32_t collection_id, const uint32_t next_seq_id, Store *store, const std::vector &search_fields, const std::vector & facet_fields, - const std::vector & rank_fields): - name(name), collection_id(collection_id), next_seq_id(next_seq_id), store(store), rank_fields(rank_fields) { + const std::vector & rank_fields, const std::string token_ordering_field): + name(name), collection_id(collection_id), next_seq_id(next_seq_id), store(store), + rank_fields(rank_fields), token_ordering_field(token_ordering_field) { for(const field& field: search_fields) { art_tree *t = new art_tree; @@ -79,9 +80,22 @@ Option Collection::add(std::string json_str) { } Option Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id) { + if(!token_ordering_field.empty() && document.count(token_ordering_field) == 0) { + return Option<>(400, "Field `" + token_ordering_field + "` has been declared as a token ordering field, " + "but is not found in the document."); + } + + if(!document[token_ordering_field].is_number()) { + return Option<>(400, "Token ordering field `" + token_ordering_field + "` must be an INT32."); + } + + if(document[token_ordering_field].get() > INT32_MAX) { + return Option<>(400, "Token ordering field `" + token_ordering_field + "` exceeds maximum value of INT32."); + } + uint32_t points = 0; - if(document.count("points") != 0) { - points = document["points"]; + if(!token_ordering_field.empty()) { + points = document[token_ordering_field]; } for(const std::pair & field_pair: search_schema) { @@ -1061,4 +1075,8 @@ std::string Collection::get_meta_key(std::string collection_name) { std::string Collection::get_seq_id_collection_prefix() { return std::to_string(collection_id) + "_" + std::string(SEQ_ID_PREFIX); +} + +std::string Collection::get_token_ordering_field() { + return token_ordering_field; } \ No newline at end of file diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index 96b2cafd..ca6d86c5 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -48,13 +48,16 @@ void CollectionManager::init(Store *store) { std::vector collection_rank_fields = collection_meta[COLLECTION_RANK_FIELDS_KEY].get>(); + std::string token_ordering_field = collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY].get(); + Collection* collection = new Collection(this_collection_name, collection_meta[COLLECTION_ID_KEY].get(), collection_next_seq_id, store, search_fields, facet_fields, - collection_rank_fields); + collection_rank_fields, + token_ordering_field); // Fetch records from the store and re-create memory index std::vector documents; @@ -79,7 +82,8 @@ void CollectionManager::init(Store *store) { Collection* CollectionManager::create_collection(std::string name, const std::vector & search_fields, const std::vector & facet_fields, - const std::vector & rank_fields) { + const std::vector & rank_fields, + const std::string & token_ordering_field) { if(store->contains(Collection::get_meta_key(name))) { return nullptr; } @@ -107,8 +111,10 @@ Collection* CollectionManager::create_collection(std::string name, const std::ve collection_meta[COLLECTION_SEARCH_FIELDS_KEY] = search_fields_json; collection_meta[COLLECTION_FACET_FIELDS_KEY] = facet_fields_json; collection_meta[COLLECTION_RANK_FIELDS_KEY] = rank_fields; - - Collection* new_collection = new Collection(name, next_collection_id, 0, store, search_fields, facet_fields, rank_fields); + collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY] = token_ordering_field; + + Collection* new_collection = new Collection(name, next_collection_id, 0, store, search_fields, facet_fields, + rank_fields, token_ordering_field); store->insert(Collection::get_meta_key(name), collection_meta.dump()); store->insert(Collection::get_next_seq_id_key(name), std::to_string(0)); diff --git a/test/collection_manager_test.cpp b/test/collection_manager_test.cpp index e1fc5c83..6d301322 100644 --- a/test/collection_manager_test.cpp +++ b/test/collection_manager_test.cpp @@ -26,7 +26,8 @@ protected: facet_fields = {field("starring", field_types::STRING)}; rank_fields = {"points"}; - collection1 = collectionManager.create_collection("collection1", search_fields, facet_fields, rank_fields); + collection1 = collectionManager.create_collection("collection1", search_fields, facet_fields, + rank_fields, "points"); } virtual void SetUp() { @@ -71,6 +72,7 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) { ASSERT_EQ(facet_fields_expected, collection1->get_facet_fields()); ASSERT_EQ(rank_fields, collection1->get_rank_fields()); ASSERT_EQ(schema.size(), collection1->get_schema().size()); + ASSERT_EQ("points", collection1->get_token_ordering_field()); results = collection1->search("thomas", search_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(4, results["hits"].size()); diff --git a/test/collection_test.cpp b/test/collection_test.cpp index 4564ee1a..aff6a647 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -31,7 +31,8 @@ protected: collection = collectionManager.get_collection("collection"); if(collection == nullptr) { - collection = collectionManager.create_collection("collection", search_fields, facet_fields, rank_fields); + collection = collectionManager.create_collection("collection", search_fields, facet_fields, + rank_fields, "points"); } std::string json_line; @@ -787,23 +788,24 @@ TEST_F(CollectionTest, IndexingWithBadData) { // should not crash when document to-be-indexed doesn't match schema Collection *sample_collection; - std::vector fields = {field("name", field_types::STRING), field("age", field_types::INT32)}; + std::vector fields = {field("name", field_types::STRING)}; facet_fields = {field("tags", field_types::STRING_ARRAY)}; std::vector rank_fields = {"age", "average"}; sample_collection = collectionManager.get_collection("sample_collection"); if(sample_collection == nullptr) { - sample_collection = collectionManager.create_collection("sample_collection", fields, facet_fields, rank_fields); + sample_collection = collectionManager.create_collection("sample_collection", fields, facet_fields, + rank_fields, "age"); } - const Option & search_fields_missing_op1 = sample_collection->add("{\"namezz\": \"foo\"}"); + const Option & search_fields_missing_op1 = sample_collection->add("{\"namezz\": \"foo\", \"age\": 29}"); ASSERT_FALSE(search_fields_missing_op1.ok()); ASSERT_STREQ("Field `name` has been declared as a search field in the schema, but is not found in the document.", search_fields_missing_op1.error().c_str()); - const Option & search_fields_missing_op2 = sample_collection->add("{\"name\": \"foo\", \"agez\": 34}"); + const Option & search_fields_missing_op2 = sample_collection->add("{\"namez\": \"foo\", \"age\": 34}"); ASSERT_FALSE(search_fields_missing_op2.ok()); - ASSERT_STREQ("Field `age` has been declared as a search field in the schema, but is not found in the document.", + ASSERT_STREQ("Field `name` has been declared as a search field in the schema, but is not found in the document.", search_fields_missing_op2.error().c_str()); const Option & facet_fields_missing_op1 = sample_collection->add("{\"name\": \"foo\", \"age\": 34}"); @@ -830,9 +832,14 @@ TEST_F(CollectionTest, IndexingWithBadData) { ASSERT_TRUE(empty_facet_field_op.ok()); doc_str = "{\"name\": \"foo\", \"age\": \"34\", \"tags\": [], \"average\": 34 }"; - const Option & bad_search_field_op = sample_collection->add(doc_str); - ASSERT_FALSE(bad_search_field_op.ok()); - ASSERT_STREQ("Search field `age` must be an INT32.", bad_search_field_op.error().c_str()); + const Option & bad_token_ordering_field_op1 = sample_collection->add(doc_str); + ASSERT_FALSE(bad_token_ordering_field_op1.ok()); + ASSERT_STREQ("Token ordering field `age` must be an INT32.", bad_token_ordering_field_op1.error().c_str()); + + doc_str = "{\"name\": \"foo\", \"age\": 343234324234233234, \"tags\": [], \"average\": 34 }"; + const Option & bad_token_ordering_field_op2 = sample_collection->add(doc_str); + ASSERT_FALSE(bad_token_ordering_field_op2.ok()); + ASSERT_STREQ("Token ordering field `age` exceeds maximum value of INT32.", bad_token_ordering_field_op2.error().c_str()); doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": [], \"average\": \"34\"}"; const Option & bad_rank_field_op = sample_collection->add(doc_str);