Speed up wildcard searches further.

2025-05-19 05:08:43 +08:00 · 2020-08-12 18:09:39 +05:30 · 2020-08-12 18:09:39 +05:30 · 14faa3af4e
commit 14faa3af4e
parent c70e83cfba
4 changed files with 30 additions and 11 deletions
--- a/TODO.md
+++ b/TODO.md
@@ -2,6 +2,13 @@

 ## Pre-alpha

+a) ~~Fix memory ratio (decreasing with indexing)~~
+b) ~~Speed up wildcard searches further~~
+c) Allow int64 in default sorting field
+d) Use connection timeout for CURL rather than request timeout
+e) Update role to set max memory ratio at 0.80
+f) Async import
+
 **Search index**

 - ~~Proper JSON as input~~
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -619,15 +619,6 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
        filters.push_back(f);
    }

-    // for a wildcard query, if filter is not specified, use default_sorting_field as a catch-all
-    if(query == "*" && filters.empty()) {
-        field f = search_schema.at(default_sorting_field);
-        std::string max_value = f.is_float() ? std::to_string(std::numeric_limits<float>::max()) :
-                                std::to_string(std::numeric_limits<int32_t>::max());
-        filter catch_all_filter = {f.name, {max_value}, LESS_THAN_EQUALS};
-        filters.push_back(catch_all_filter);
-    }
-
    // validate facet fields
    for(const std::string & field_name: facet_fields) {
        if(facet_schema.count(field_name) == 0) {
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -1164,11 +1164,32 @@ void Index::search(Option<uint32_t> & outcome,
        const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - 0);
        const std::string & field = search_fields[0];

+        // if a filter is not specified, use the sorting index to generate the list of all document ids
+        if(filters.empty()) {
+            std::string all_records_field;
+
+            // get the first non-optional field
+            for(const auto& kv: sort_schema) {
+                if(!kv.second.optional && kv.first != sort_field_const::text_match) {
+                    all_records_field = kv.first;
+                    break;
+                }
+            }
+
+            const spp::sparse_hash_map<uint32_t, int64_t> *kvs = sort_index[all_records_field];
+            filter_ids_length = kvs->size();
+            filter_ids = new uint32_t[filter_ids_length];
+
+            size_t i = 0;
+            for(const auto& kv: *kvs) {
+                filter_ids[i++] = kv.first;
+            }
+        }
+
        if(!curated_ids.empty()) {
            uint32_t *excluded_result_ids = nullptr;
            filter_ids_length = ArrayUtils::exclude_scalar(filter_ids, filter_ids_length, &curated_ids_sorted[0],
                                                     curated_ids.size(), &excluded_result_ids);
-
            delete [] filter_ids;
            filter_ids = excluded_result_ids;
        }
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@@ -2259,7 +2259,7 @@ TEST_F(CollectionTest, OptionalFields) {

    infile.close();

-    // first must be able to fetch all records (i.e. all must have been index)
+    // first must be able to fetch all records (i.e. all must have been indexed)

    auto res = coll1->search("*", {"title"}, "", {}, {}, 0, 10, 1, FREQUENCY, false).get();
    ASSERT_EQ(6, res["found"].get<size_t>());