Parameterize the token ordering field.

2025-05-18 12:42:50 +08:00 · 2017-03-26 21:26:01 +05:30 · 2017-03-26 21:26:01 +05:30 · 70dda716c5
commit 70dda716c5
parent fab27d9f5c
7 changed files with 60 additions and 21 deletions
--- a/TODO.md
+++ b/TODO.md
@ -28,7 +28,7 @@
 - ~~Filters~~
 - ~~Facets~~
 - ~~Schema validation during insertion (missing fields + type errors)~~
- Proper score field for ranking tokens
+- ~~Proper score field for ranking tokens~~
 - Prevent string copy during indexing
 - clean special chars before indexing
 - Minimum results should be a variable instead of blindly going with max_results
--- a/include/collection.h
+++ b/include/collection.h
@ -33,6 +33,8 @@ private:

    spp::sparse_hash_map<std::string, spp::sparse_hash_map<uint32_t, int64_t>*> rank_index;

+    std::string token_ordering_field;
+
    std::string get_doc_id_key(std::string doc_id);

    std::string get_seq_id_key(uint32_t seq_id);
@ -80,7 +82,7 @@ public:

    Collection(const std::string name, const uint32_t collection_id, const uint32_t next_seq_id, Store *store,
               const std::vector<field> & search_fields, const std::vector<field> & facet_fields,
-               const std::vector<std::string> & rank_fields);
+               const std::vector<std::string> & rank_fields, const std::string token_ordering_field);

    ~Collection();

@ -102,6 +104,8 @@ public:

    spp::sparse_hash_map<std::string, field> get_schema();

+    std::string get_token_ordering_field();
+
    Option<std::string> add(std::string json_str);

    nlohmann::json search(std::string query, const std::vector<std::string> search_fields,
--- a/include/collection_manager.h
+++ b/include/collection_manager.h
@ -23,6 +23,7 @@ private:
    static constexpr const char* COLLECTION_SEARCH_FIELDS_KEY = "search_fields";
    static constexpr const char* COLLECTION_FACET_FIELDS_KEY = "facet_fields";
    static constexpr const char* COLLECTION_RANK_FIELDS_KEY = "rank_fields";
+    static constexpr const char* COLLECTION_TOKEN_ORDERING_FIELD_KEY = "token_ordering_field";

    CollectionManager();

@ -41,7 +42,8 @@ public:

    Collection* create_collection(std::string name, const std::vector<field> & search_fields,
                                  const std::vector<field> & facet_fields,
-                                  const std::vector<std::string> & rank_fields);
+                                  const std::vector<std::string> & rank_fields,
+                                  const std::string & token_ordering_field = "");

    Collection* get_collection(std::string collection_name);

--- a/src/collection.cpp
+++ b/src/collection.cpp
@ -9,8 +9,9 @@

 Collection::Collection(const std::string name, const uint32_t collection_id, const uint32_t next_seq_id, Store *store,
                       const std::vector<field> &search_fields, const std::vector<field> & facet_fields,
-                       const std::vector<std::string> & rank_fields):
-    name(name), collection_id(collection_id), next_seq_id(next_seq_id), store(store), rank_fields(rank_fields) {
+                       const std::vector<std::string> & rank_fields, const std::string token_ordering_field):
+                       name(name), collection_id(collection_id), next_seq_id(next_seq_id), store(store),
+                       rank_fields(rank_fields), token_ordering_field(token_ordering_field) {

    for(const field& field: search_fields) {
        art_tree *t = new art_tree;
@ -79,9 +80,22 @@ Option<std::string> Collection::add(std::string json_str) {
 }

 Option<uint32_t> Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id) {
+    if(!token_ordering_field.empty() && document.count(token_ordering_field) == 0) {
+        return Option<>(400, "Field `" + token_ordering_field  + "` has been declared as a token ordering field, "
+                        "but is not found in the document.");
+    }
+
+    if(!document[token_ordering_field].is_number()) {
+        return Option<>(400, "Token ordering field `" + token_ordering_field  + "` must be an INT32.");
+    }
+
+    if(document[token_ordering_field].get<int64_t>() > INT32_MAX) {
+        return Option<>(400, "Token ordering field `" + token_ordering_field  + "` exceeds maximum value of INT32.");
+    }
+
    uint32_t points = 0;
-    if(document.count("points") != 0) {
-        points = document["points"];
+    if(!token_ordering_field.empty()) {
+        points = document[token_ordering_field];
    }

    for(const std::pair<std::string, field> & field_pair: search_schema) {
@ -1061,4 +1075,8 @@ std::string Collection::get_meta_key(std::string collection_name) {

 std::string Collection::get_seq_id_collection_prefix() {
    return std::to_string(collection_id) + "_" + std::string(SEQ_ID_PREFIX);
+}
+
+std::string Collection::get_token_ordering_field() {
+    return token_ordering_field;
 }
--- a/src/collection_manager.cpp
+++ b/src/collection_manager.cpp
@ -48,13 +48,16 @@ void CollectionManager::init(Store *store) {
        std::vector<std::string> collection_rank_fields =
                collection_meta[COLLECTION_RANK_FIELDS_KEY].get<std::vector<std::string>>();

+        std::string token_ordering_field = collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY].get<std::string>();
+
        Collection* collection = new Collection(this_collection_name,
                                                collection_meta[COLLECTION_ID_KEY].get<uint32_t>(),
                                                collection_next_seq_id,
                                                store,
                                                search_fields,
                                                facet_fields,
-                                                collection_rank_fields);
+                                                collection_rank_fields,
+                                                token_ordering_field);

        // Fetch records from the store and re-create memory index
        std::vector<std::string> documents;
@ -79,7 +82,8 @@ void CollectionManager::init(Store *store) {

 Collection* CollectionManager::create_collection(std::string name, const std::vector<field> & search_fields,
                                                 const std::vector<field> & facet_fields,
-                                                 const std::vector<std::string> & rank_fields) {
+                                                 const std::vector<std::string> & rank_fields,
+                                                 const std::string & token_ordering_field) {
    if(store->contains(Collection::get_meta_key(name))) {
        return nullptr;
    }
@ -107,8 +111,10 @@ Collection* CollectionManager::create_collection(std::string name, const std::ve
    collection_meta[COLLECTION_SEARCH_FIELDS_KEY] = search_fields_json;
    collection_meta[COLLECTION_FACET_FIELDS_KEY] = facet_fields_json;
    collection_meta[COLLECTION_RANK_FIELDS_KEY] = rank_fields;
-    
-    Collection* new_collection = new Collection(name, next_collection_id, 0, store, search_fields, facet_fields, rank_fields);
+    collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY] = token_ordering_field;
+
+    Collection* new_collection = new Collection(name, next_collection_id, 0, store, search_fields, facet_fields,
+                                                rank_fields, token_ordering_field);

    store->insert(Collection::get_meta_key(name), collection_meta.dump());
    store->insert(Collection::get_next_seq_id_key(name), std::to_string(0));
--- a/test/collection_manager_test.cpp
+++ b/test/collection_manager_test.cpp
@ -26,7 +26,8 @@ protected:
        facet_fields = {field("starring", field_types::STRING)};
        rank_fields = {"points"};

-        collection1 = collectionManager.create_collection("collection1", search_fields, facet_fields, rank_fields);
+        collection1 = collectionManager.create_collection("collection1", search_fields, facet_fields,
+                                                          rank_fields, "points");
    }

    virtual void SetUp() {
@ -71,6 +72,7 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) {
    ASSERT_EQ(facet_fields_expected, collection1->get_facet_fields());
    ASSERT_EQ(rank_fields, collection1->get_rank_fields());
    ASSERT_EQ(schema.size(), collection1->get_schema().size());
+    ASSERT_EQ("points", collection1->get_token_ordering_field());

    results = collection1->search("thomas", search_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false);
    ASSERT_EQ(4, results["hits"].size());
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@ -31,7 +31,8 @@ protected:

        collection = collectionManager.get_collection("collection");
        if(collection == nullptr) {
-            collection = collectionManager.create_collection("collection", search_fields, facet_fields, rank_fields);
+            collection = collectionManager.create_collection("collection", search_fields, facet_fields,
+                                                             rank_fields, "points");
        }

        std::string json_line;
@ -787,23 +788,24 @@ TEST_F(CollectionTest, IndexingWithBadData) {
    // should not crash when document to-be-indexed doesn't match schema
    Collection *sample_collection;

-    std::vector<field> fields = {field("name", field_types::STRING), field("age", field_types::INT32)};
+    std::vector<field> fields = {field("name", field_types::STRING)};
    facet_fields = {field("tags", field_types::STRING_ARRAY)};
    std::vector<std::string> rank_fields = {"age", "average"};

    sample_collection = collectionManager.get_collection("sample_collection");
    if(sample_collection == nullptr) {
-        sample_collection = collectionManager.create_collection("sample_collection", fields, facet_fields, rank_fields);
+        sample_collection = collectionManager.create_collection("sample_collection", fields, facet_fields,
+                                                                rank_fields, "age");
    }

-    const Option<std::string> & search_fields_missing_op1 = sample_collection->add("{\"namezz\": \"foo\"}");
+    const Option<std::string> & search_fields_missing_op1 = sample_collection->add("{\"namezz\": \"foo\", \"age\": 29}");
    ASSERT_FALSE(search_fields_missing_op1.ok());
    ASSERT_STREQ("Field `name` has been declared as a search field in the schema, but is not found in the document.",
                 search_fields_missing_op1.error().c_str());

-    const Option<std::string> & search_fields_missing_op2 = sample_collection->add("{\"name\": \"foo\", \"agez\": 34}");
+    const Option<std::string> & search_fields_missing_op2 = sample_collection->add("{\"namez\": \"foo\", \"age\": 34}");
    ASSERT_FALSE(search_fields_missing_op2.ok());
-    ASSERT_STREQ("Field `age` has been declared as a search field in the schema, but is not found in the document.",
+    ASSERT_STREQ("Field `name` has been declared as a search field in the schema, but is not found in the document.",
                 search_fields_missing_op2.error().c_str());

    const Option<std::string> & facet_fields_missing_op1 = sample_collection->add("{\"name\": \"foo\", \"age\": 34}");
@ -830,9 +832,14 @@ TEST_F(CollectionTest, IndexingWithBadData) {
    ASSERT_TRUE(empty_facet_field_op.ok());

    doc_str = "{\"name\": \"foo\", \"age\": \"34\", \"tags\": [], \"average\": 34 }";
-    const Option<std::string> & bad_search_field_op = sample_collection->add(doc_str);
-    ASSERT_FALSE(bad_search_field_op.ok());
-    ASSERT_STREQ("Search field `age` must be an INT32.", bad_search_field_op.error().c_str());
+    const Option<std::string> & bad_token_ordering_field_op1 = sample_collection->add(doc_str);
+    ASSERT_FALSE(bad_token_ordering_field_op1.ok());
+    ASSERT_STREQ("Token ordering field `age` must be an INT32.", bad_token_ordering_field_op1.error().c_str());
+
+    doc_str = "{\"name\": \"foo\", \"age\": 343234324234233234, \"tags\": [], \"average\": 34 }";
+    const Option<std::string> & bad_token_ordering_field_op2 = sample_collection->add(doc_str);
+    ASSERT_FALSE(bad_token_ordering_field_op2.ok());
+    ASSERT_STREQ("Token ordering field `age` exceeds maximum value of INT32.", bad_token_ordering_field_op2.error().c_str());

    doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": [], \"average\": \"34\"}";
    const Option<std::string> & bad_rank_field_op = sample_collection->add(doc_str);