Use unordered_map for low-volume metadata structures.

The iteration order of spp::sparse_hash_map differs between clang and gcc.
2025-05-18 04:32:38 +08:00 · 2017-12-20 06:45:21 +05:30 · 2017-12-20 06:45:21 +05:30 · 01275c38f2
commit 01275c38f2
parent 8d5f7c18a3
6 changed files with 29 additions and 21 deletions
--- a/include/collection.h
+++ b/include/collection.h
@ -3,6 +3,7 @@
 #include <string>
 #include <vector>
 #include <string>
+#include <unordered_map>
 #include <thread>
 #include <mutex>
 #include <condition_variable>
@ -35,11 +36,11 @@ private:

    std::vector<field> fields;

-    spp::sparse_hash_map<std::string, field> search_schema;
+    std::unordered_map<std::string, field> search_schema;

-    spp::sparse_hash_map<std::string, field> facet_schema;
+    std::unordered_map<std::string, field> facet_schema;

-    spp::sparse_hash_map<std::string, field> sort_schema;
+    std::unordered_map<std::string, field> sort_schema;

    Store* store;

@ -87,7 +88,7 @@ public:

    std::vector<field> get_fields();

-    spp::sparse_hash_map<std::string, field> get_schema();
+    std::unordered_map<std::string, field> get_schema();

    std::string get_token_ranking_field();

--- a/include/index.h
+++ b/include/index.h
@ -1,6 +1,7 @@
 #pragma once

 #include <string>
+#include <unordered_map>
 #include <vector>
 #include <mutex>
 #include <condition_variable>
@ -57,11 +58,11 @@ private:

    size_t num_documents;

-    spp::sparse_hash_map<std::string, field> search_schema;
+    std::unordered_map<std::string, field> search_schema;

-    spp::sparse_hash_map<std::string, field> facet_schema;
+    std::unordered_map<std::string, field> facet_schema;

-    spp::sparse_hash_map<std::string, field> sort_schema;
+    std::unordered_map<std::string, field> sort_schema;

    spp::sparse_hash_map<std::string, art_tree*> search_index;

@ -125,8 +126,8 @@ private:
 public:
    Index() = delete;

-    Index(const std::string name, spp::sparse_hash_map<std::string, field> search_schema,
-          spp::sparse_hash_map<std::string, field> facet_schema, spp::sparse_hash_map<std::string, field> sort_schema);
+    Index(const std::string name, std::unordered_map<std::string, field> search_schema,
+          std::unordered_map<std::string, field> facet_schema, std::unordered_map<std::string, field> sort_schema);

    ~Index();

--- a/src/collection.cpp
+++ b/src/collection.cpp
@ -6,6 +6,8 @@
 #include <match_score.h>
 #include <string_utils.h>
 #include <art.h>
+#include <thread>
+#include <chrono>

 Collection::Collection(const std::string name, const uint32_t collection_id, const uint32_t next_seq_id, Store *store,
                       const std::vector<field> &fields, const std::string & token_ranking_field):
@ -417,6 +419,7 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
            index->processed = false;
        }
        index->cv.notify_one();
+        //std::this_thread::sleep_for(std::chrono::milliseconds(400));
    }

    Option<nlohmann::json> index_search_op({});  // stores the last error across all index threads
@ -739,7 +742,7 @@ std::vector<field> Collection::get_fields() {
    return fields;
 }

-spp::sparse_hash_map<std::string, field> Collection::get_schema() {
+std::unordered_map<std::string, field> Collection::get_schema() {
    return search_schema;
 };

--- a/src/index.cpp
+++ b/src/index.cpp
@ -2,13 +2,14 @@

 #include <numeric>
 #include <chrono>
+#include <unordered_map>
 #include <array_utils.h>
 #include <match_score.h>
 #include <string_utils.h>
 #include <art.h>

-Index::Index(const std::string name, spp::sparse_hash_map<std::string, field> search_schema,
-             spp::sparse_hash_map<std::string, field> facet_schema, spp::sparse_hash_map<std::string, field> sort_schema):
+Index::Index(const std::string name, std::unordered_map<std::string, field> search_schema,
+             std::unordered_map<std::string, field> facet_schema, std::unordered_map<std::string, field> sort_schema):
        name(name), search_schema(search_schema), facet_schema(facet_schema), sort_schema(sort_schema) {

    for(const auto pair: search_schema) {
@ -635,7 +636,7 @@ void Index::search_field(std::string & query, const std::string & field, uint32_
    spp::sparse_hash_map<std::string, std::vector<art_leaf*>> token_cost_cache;

    // Used to drop the least occurring token(s) for partial searches
-    spp::sparse_hash_map<std::string, uint32_t> token_to_count;
+    std::unordered_map<std::string, uint32_t> token_to_count;

    std::vector<std::vector<int>> token_to_costs;

@ -877,10 +878,12 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const int &
        const number_t & secondary_rank_value = secondary_rank_score * secondary_rank_factor;
        topster.add(seq_id, query_index, match_score, primary_rank_value, secondary_rank_value);

-        /*std::cout << name << ", total_cost: " << total_cost
-                  << ", words_present: " << mscore.words_present << ", match_score: " << match_score
-                  << ", primary_rank_score: " << primary_rank_score.intval << ", distance: " << mscore.distance
-                  << ", seq_id: " << seq_id << std::endl;*/
+        /*std::ostringstream os;
+        os << name << ", total_cost: " << (255 - total_cost)
+                << ", words_present: " << mscore.words_present << ", match_score: " << match_score
+                << ", primary_rank_score: " << primary_rank_score.intval << ", distance: " << (MAX_SEARCH_TOKENS - mscore.distance)
+                << ", seq_id: " << seq_id << std::endl;
+        std::cout << os.str();*/
    }

    //long long int timeNanos = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
--- a/test/collection_manager_test.cpp
+++ b/test/collection_manager_test.cpp
@ -47,7 +47,7 @@ TEST_F(CollectionManagerTest, CollectionCreation) {
    collection1 = collectionManager2.get_collection("collection1");
    ASSERT_NE(nullptr, collection1);

-    spp::sparse_hash_map<std::string, field> schema = collection1->get_schema();
+    std::unordered_map<std::string, field> schema = collection1->get_schema();
    std::vector<std::string> facet_fields_expected = {"cast"};

    ASSERT_EQ(0, collection1->get_collection_id());
@ -119,7 +119,7 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) {
    nlohmann::json results = collection1->search("thomas", search_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
    ASSERT_EQ(4, results["hits"].size());

-    spp::sparse_hash_map<std::string, field> schema = collection1->get_schema();
+    std::unordered_map<std::string, field> schema = collection1->get_schema();

    // create a new collection manager to ensure that it restores the records from the disk backed store
    CollectionManager & collectionManager2 = CollectionManager::get_instance();
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@ -1424,10 +1424,10 @@ TEST_F(CollectionTest, IndexingWithBadData) {

    const Option<std::string> & search_fields_missing_op1 = sample_collection->add("{\"namezz\": \"foo\", \"age\": 29, \"average\": 78}");
    ASSERT_FALSE(search_fields_missing_op1.ok());
-    ASSERT_STREQ("Field `name` has been declared in the schema, but is not found in the document.",
+    ASSERT_STREQ("Field `tags` has been declared in the schema, but is not found in the document.",
                 search_fields_missing_op1.error().c_str());

-    const Option<std::string> & search_fields_missing_op2 = sample_collection->add("{\"namez\": \"foo\", \"age\": 34, \"average\": 78}");
+    const Option<std::string> & search_fields_missing_op2 = sample_collection->add("{\"namez\": \"foo\", \"tags\": [], \"age\": 34, \"average\": 78}");
    ASSERT_FALSE(search_fields_missing_op2.ok());
    ASSERT_STREQ("Field `name` has been declared in the schema, but is not found in the document.",
                 search_fields_missing_op2.error().c_str());