Highlight the best-matched string in an array.

An ARRAY_SEPARATOR delimiter is used to demarcate the offsets of tokens that come from different indices of an array. A plain string field is treated like a single-element array field, but without needing a delimiter.
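To make the encoding concrete, here is a minimal standalone sketch (flatten_offsets and the sample data are hypothetical, not the actual indexing code): for a field value of ["plain truth", "truth wins"], the token "truth" occurs at position 1 of element 0 and position 0 of element 1, so its flattened offset list becomes {1, SEP, 0, SEP}.

```cpp
#include <cassert>
#include <cstdint>
#include <map>
#include <vector>

// SEP mirrors Index::ARRAY_SEPARATOR (UINT16_MAX).
static const uint16_t SEP = UINT16_MAX;

// Hypothetical helper: flatten a token's per-element offsets into one list,
// appending a separator after every array element (even empty ones).
std::vector<uint16_t> flatten_offsets(const std::map<size_t, std::vector<uint16_t>> & per_element,
                                      size_t array_size) {
    std::vector<uint16_t> flattened;
    for(size_t i = 0; i < array_size; i++) {
        auto it = per_element.find(i);
        if(it != per_element.end()) {
            flattened.insert(flattened.end(), it->second.begin(), it->second.end());
        }
        flattened.push_back(SEP);
    }
    return flattened;
}

int main() {
    // Token "truth" in ["plain truth", "truth wins"]: element 0 @ pos 1, element 1 @ pos 0.
    std::vector<uint16_t> offsets = flatten_offsets({{0, {1}}, {1, {0}}}, 2);
    assert((offsets == std::vector<uint16_t>{1, SEP, 0, SEP}));
}
```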
This commit is contained in:
Kishore Nallan 2018-04-20 16:44:03 +05:30
parent dea9df233f
commit b669a47c29
7 changed files with 269 additions and 64 deletions

View File

@@ -100,6 +100,8 @@
- NOT operator support
- Log operations
- Parameterize replica's MAX_UPDATES_TO_SEND
- 64K token limit
- > INT32_MAX validation for float field
- highlight of string arrays?
- test for token ranking on float field

View File

@@ -84,10 +84,6 @@ private:
void do_facets(std::vector<facet> & facets, uint32_t* result_ids, size_t results_size);
void populate_token_positions(const std::vector<art_leaf *> &query_suggestion,
spp::sparse_hash_map<const art_leaf *, uint32_t *> &leaf_to_indices,
size_t result_index, std::vector<std::vector<uint16_t>> &token_positions) const;
void search_field(std::string & query, const std::string & field, uint32_t *filter_ids, size_t filter_ids_length,
std::vector<facet> & facets, const std::vector<sort_by> & sort_fields,
const int num_typos, const size_t num_results,
@@ -102,6 +98,9 @@ private:
Topster<512> & topster, size_t & total_results, uint32_t** all_result_ids,
size_t & all_result_ids_len, const size_t & max_results, const bool prefix);
void insert_doc(const uint32_t score, art_tree *t, uint32_t seq_id,
const std::unordered_map<std::string, std::vector<uint32_t>> &token_to_offsets) const;
void index_string_field(const std::string & text, const uint32_t score, art_tree *t, uint32_t seq_id,
const bool verbatim) const;
@@ -147,6 +146,11 @@ public:
Option<uint32_t> remove(const uint32_t seq_id, nlohmann::json & document);
static void populate_token_positions(const std::vector<art_leaf *> &query_suggestion,
spp::sparse_hash_map<const art_leaf *, uint32_t *> &leaf_to_indices,
size_t result_index,
std::vector<std::vector<std::vector<uint16_t>>> &array_token_positions);
void score_results(const std::vector<sort_by> & sort_fields, const int & query_index, const uint32_t total_cost,
Topster<512> &topster, const std::vector<art_leaf *> & query_suggestion,
const uint32_t *result_ids, const size_t result_size) const;
@@ -162,6 +166,8 @@ public:
// strings under this length will be fully highlighted, instead of showing a snippet of the relevant portion
enum {SNIPPET_STR_ABOVE_LEN = 30};
enum {ARRAY_SEPARATOR = UINT16_MAX};
// Using a $ prefix so that these meta keys stay above record entries in a lexicographically ordered KV store
static constexpr const char* COLLECTION_META_PREFIX = "$CM";
static constexpr const char* COLLECTION_NEXT_SEQ_PREFIX = "$CS";
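(Reserving UINT16_MAX as the separator is safe because token offsets are stored as uint16_t: the maximum representable value serves as a sentinel rather than a real position, which presumably relates to the 64K token limit noted in the TODO above.)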

View File

@@ -35,7 +35,7 @@ struct Match {
uint16_t start_offset;
char offset_diffs[16];
Match() {
Match(): words_present(0), distance(0), start_offset(0) {
}
@@ -44,6 +44,14 @@ struct Match {
memcpy(offset_diffs, offset_diffs_stacked, 16);
}
// Construct a single match score from individual components (for multi-field sort)
inline uint64_t get_match_score(const uint32_t total_cost) const {
uint64_t match_score = ((int64_t)(words_present) << 24) |
((int64_t)(255 - total_cost) << 16) |
((int64_t)(distance));
return match_score;
}
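As a quick illustration of the ordering this packing produces — a standalone sketch where pack_score is a hypothetical restatement of the expression above — matching more query words always outranks a cheaper (lower typo cost) match, and cost outranks distance:

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical restatement of get_match_score(): words_present occupies the
// highest bits, then inverted typo cost, then distance in the lowest byte.
uint64_t pack_score(uint8_t words_present, uint8_t total_cost, uint8_t distance) {
    return ((uint64_t) words_present << 24) |
           ((uint64_t) (255 - total_cost) << 16) |
           ((uint64_t) distance);
}

int main() {
    // 3 words matched with 2 typos beats 2 words matched exactly...
    assert(pack_score(3, 2, 10) > pack_score(2, 0, 0));
    // ...and at equal words_present, the lower typo cost wins.
    assert(pack_score(2, 0, 5) > pack_score(2, 1, 0));
}
```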
static void print_token_offsets(std::vector<std::vector<uint16_t>> &token_offsets) {
for(auto offsets: token_offsets) {
for(auto offset: offsets) {
@@ -54,7 +62,8 @@
}
static inline void addTopOfHeapToWindow(TokenOffsetHeap &heap, std::queue<TokenOffset> &window,
std::vector<std::vector<uint16_t>> &token_offsets, uint16_t *token_offset) {
const std::vector<std::vector<uint16_t>> &token_offsets,
uint16_t *token_offset) {
TokenOffset top = heap.top();
heap.pop();
window.push(top);
@@ -90,7 +99,7 @@ struct Match {
* We use a priority queue to read the offset vectors in a sorted manner, slide a window of a given size, and
* compute the max_match and min_displacement of target tokens across the windows.
*/
static Match match(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
static Match match(uint32_t doc_id, const std::vector<std::vector<uint16_t>> &token_offsets) {
std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset> heap;
for(uint8_t token_id=0; token_id < token_offsets.size(); token_id++) {
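To make the windowing idea concrete, here is a simplified, self-contained sketch (assuming a window size of 10; the real match() also tracks the displacement between tokens, which this sketch omits):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <set>
#include <utility>
#include <vector>

// Count the most distinct query tokens that co-occur within a fixed window.
// token_offsets[token_id] holds the sorted positions of that token in the field.
size_t best_tokens_in_window(const std::vector<std::vector<uint16_t>> & token_offsets,
                             uint16_t window_size = 10) {
    std::vector<std::pair<uint16_t, size_t>> merged;  // (offset, token_id)
    for(size_t token_id = 0; token_id < token_offsets.size(); token_id++) {
        for(uint16_t offset : token_offsets[token_id]) {
            merged.emplace_back(offset, token_id);
        }
    }
    std::sort(merged.begin(), merged.end());

    size_t best = 0;
    for(size_t i = 0; i < merged.size(); i++) {
        std::set<size_t> tokens_in_window;
        for(size_t j = i; j < merged.size() &&
                          merged[j].first - merged[i].first < window_size; j++) {
            tokens_in_window.insert(merged[j].second);
        }
        best = std::max(best, tokens_in_window.size());
    }
    return best;
}

int main() {
    // "truth" at offsets {1, 9} and "about" at offset {2} fit in one window.
    std::cout << best_tokens_in_window({{1, 9}, {2}}) << std::endl;  // prints 2
}
```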

View File

@@ -545,14 +545,9 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
const std::string & field_name = search_fields[search_fields.size() - field_order_kv.first];
field search_field = search_schema.at(field_name);
// only string fields are supported for now
if(search_field.type == field_types::STRING) {
std::vector<std::string> tokens;
StringUtils::split(document[field_name], tokens, " ");
// positions in the document of each token in the query
std::vector<std::vector<uint16_t>> token_positions;
if(search_field.type == field_types::STRING || search_field.type == field_types::STRING_ARRAY) {
spp::sparse_hash_map<const art_leaf*, uint32_t*> leaf_to_indices;
for (const art_leaf *token_leaf : searched_queries[field_order_kv.second.query_index]) {
std::vector<uint16_t> positions;
uint32_t doc_index = token_leaf->values->ids.indexOf(field_order_kv.second.key);
@@ -560,20 +555,42 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
continue;
}
uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
token_leaf->values->offsets.getLength() :
token_leaf->values->offset_index.at(doc_index+1);
while(start_offset < end_offset) {
positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset));
start_offset++;
}
token_positions.push_back(positions);
uint32_t *indices = new uint32_t[1];
indices[0] = doc_index;
leaf_to_indices.emplace(token_leaf, indices);
}
Match match = Match::match(field_order_kv.second.key, token_positions);
// positions in the field of each token in the query
std::vector<std::vector<std::vector<uint16_t>>> array_token_positions;
Index::populate_token_positions(searched_queries[field_order_kv.second.query_index],
leaf_to_indices, 0, array_token_positions);
Match match;
uint64_t match_score = 0;
size_t matched_array_index = 0;
for(size_t array_index = 0; array_index < array_token_positions.size(); array_index++) {
const std::vector<std::vector<uint16_t>> & token_positions = array_token_positions[array_index];
if(token_positions.empty()) {
continue;
}
const Match & this_match = Match::match(field_order_kv.second.key, token_positions);
uint64_t this_match_score = this_match.get_match_score(1);
if(this_match_score > match_score) {
match_score = this_match_score;
match = this_match;
matched_array_index = array_index;
}
}
std::vector<std::string> tokens;
if(search_field.type == field_types::STRING) {
StringUtils::split(document[field_name], tokens, " ");
} else {
StringUtils::split(document[field_name][matched_array_index], tokens, " ");
}
// unpack `match.offset_diffs` into `token_indices`
std::vector<size_t> token_indices;
@@ -609,6 +626,11 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
wrapper_doc["highlight"] = nlohmann::json::object();
wrapper_doc["highlight"][field_name] = snippet_stream.str();
for (auto it = leaf_to_indices.begin(); it != leaf_to_indices.end(); it++) {
delete [] it->second;
it->second = nullptr;
}
}
result["hits"].push_back(wrapper_doc);

View File

@@ -225,6 +225,7 @@ void Index::index_string_field(const std::string & text, const uint32_t score, a
token_to_offsets[text].push_back(0);
} else {
StringUtils::split(text, tokens, " ");
for(uint32_t i=0; i<tokens.size(); i++) {
auto & token = tokens[i];
string_utils.unicode_normalize(token);
@@ -232,6 +233,11 @@ void Index::index_string_field(const std::string & text, const uint32_t score, a
}
}
insert_doc(score, t, seq_id, token_to_offsets);
}
void Index::insert_doc(const uint32_t score, art_tree *t, uint32_t seq_id,
const std::unordered_map<std::string, std::vector<uint32_t>> &token_to_offsets) const {
for(auto & kv: token_to_offsets) {
art_document art_doc;
art_doc.id = seq_id;
@@ -263,9 +269,33 @@ void Index::index_string_field(const std::string & text, const uint32_t score, a
void Index::index_string_array_field(const std::vector<std::string> & strings, const uint32_t score, art_tree *t,
uint32_t seq_id, const bool verbatim) const {
for(const std::string & str: strings) {
index_string_field(str, score, t, seq_id, verbatim);
std::unordered_map<std::string, std::unordered_map<size_t, std::vector<uint32_t>>> token_array_positions;
for(size_t array_index = 0; array_index < strings.size(); array_index++) {
const std::string & str = strings[array_index];
std::vector<std::string> tokens;
StringUtils::split(str, tokens, " ");
for(uint32_t i=0; i<tokens.size(); i++) {
auto & token = tokens[i];
string_utils.unicode_normalize(token);
token_array_positions[token][array_index].push_back(i);
}
}
std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;
for(const auto & kv: token_array_positions) {
for(size_t array_index = 0; array_index < strings.size(); array_index++) {
token_to_offsets[kv.first].insert(token_to_offsets[kv.first].end(),
token_array_positions[kv.first][array_index].begin(),
token_array_positions[kv.first][array_index].end());
token_to_offsets[kv.first].push_back(ARRAY_SEPARATOR);
}
}
insert_doc(score, t, seq_id, token_to_offsets);
}
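Walking through the flattening with the first document of the new test fixture below ({"tags": ["the truth", "about forever", "truth about"]}): the token "truth" appears at position 1 of element 0 and position 0 of element 2, so its stored offsets become 1, ARRAY_SEPARATOR, ARRAY_SEPARATOR, 0, ARRAY_SEPARATOR — one separator closing each of the three elements, including element 1 where the token never occurs.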
void Index::index_int32_array_field(const std::vector<int32_t> & values, const uint32_t score, art_tree *t,
@@ -850,9 +880,7 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const int &
char empty_offset_diffs[16];
std::fill_n(empty_offset_diffs, 16, 0);
Match single_token_match = Match(1, 0, 0, empty_offset_diffs);
const uint64_t single_token_match_score = ((int64_t)(single_token_match.words_present) << 24) |
((int64_t)(255 - total_cost) << 16) |
((int64_t)(single_token_match.distance));
const uint64_t single_token_match_score = single_token_match.get_match_score(total_cost);
for(size_t i=0; i<result_size; i++) {
const uint32_t seq_id = result_ids[i];
@@ -862,14 +890,28 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const int &
if(query_suggestion.size() == 1) {
match_score = single_token_match_score;
} else {
std::vector<std::vector<uint16_t>> token_positions;
populate_token_positions(query_suggestion, leaf_to_indices, i, token_positions);
const Match & match = Match::match(seq_id, token_positions);
std::vector<std::vector<std::vector<uint16_t>>> array_token_positions;
populate_token_positions(query_suggestion, leaf_to_indices, i, array_token_positions);
// Construct a single match score from individual components (for multi-field sort)
match_score = ((int64_t)(match.words_present) << 24) |
((int64_t)(255 - total_cost) << 16) |
((int64_t)(match.distance));
for(const std::vector<std::vector<uint16_t>> & token_positions: array_token_positions) {
if(token_positions.size() == 0) {
continue;
}
const Match & match = Match::match(seq_id, token_positions);
uint64_t this_match_score = match.get_match_score(total_cost);
if(this_match_score > match_score) {
match_score = this_match_score;
}
/*std::ostringstream os;
os << name << ", total_cost: " << (255 - total_cost)
<< ", words_present: " << match.words_present
<< ", match_score: " << match_score
<< ", match.distance: " << match.distance
<< ", seq_id: " << seq_id << std::endl;
std::cout << os.str();*/
}
}
const int64_t default_score = 0;
@@ -889,15 +931,6 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const int &
const number_t & primary_rank_value = primary_rank_score * primary_rank_factor;
const number_t & secondary_rank_value = secondary_rank_score * secondary_rank_factor;
topster.add(seq_id, query_index, match_score, primary_rank_value, secondary_rank_value);
/*
std::ostringstream os;
os << name << ", total_cost: " << (255 - total_cost)
<< ", words_present: " << match.words_present << ", match_score: " << match_score
<< ", match.distance: " << match.distance
<< ", seq_id: " << seq_id << std::endl;
LOG(INFO) << os.str();
*/
}
//long long int timeNanos = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
@@ -910,28 +943,82 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const int &
}
void Index::populate_token_positions(const std::vector<art_leaf *> &query_suggestion,
spp::sparse_hash_map<const art_leaf *, uint32_t *> &leaf_to_indices,
size_t result_index, std::vector<std::vector<uint16_t>> &token_positions) const {
// for each token in the query, find the positions that it appears in this document
spp::sparse_hash_map<const art_leaf *, uint32_t *> &leaf_to_indices,
size_t result_index,
std::vector<std::vector<std::vector<uint16_t>>> &array_token_positions) {
// array_token_positions:
// for every element in a potential array, for every token in query suggestion, get the positions
// first let's ascertain the size of the array
size_t array_size = 0;
for (const art_leaf *token_leaf : query_suggestion) {
uint32_t doc_index = leaf_to_indices.at(token_leaf)[result_index];
if(doc_index == token_leaf->values->ids.getLength()) {
continue;
}
uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
token_leaf->values->offsets.getLength() :
token_leaf->values->offset_index.at(doc_index+1);
while(start_offset < end_offset) {
uint16_t pos = (uint16_t) token_leaf->values->offsets.at(start_offset);
if(pos == ARRAY_SEPARATOR) {
array_size++;
}
start_offset++;
}
if(array_size == 0) {
// for plain string fields that don't use an ARRAY_SEPARATOR
array_size = 1;
}
break;
}
// initialize array_token_positions
array_token_positions = std::vector<std::vector<std::vector<uint16_t>>>(array_size);
// for each token in the query, find the positions that it appears in the array
for (const art_leaf *token_leaf : query_suggestion) {
uint32_t doc_index = leaf_to_indices.at(token_leaf)[result_index];
if(doc_index == token_leaf->values->ids.getLength()) {
continue;
}
uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
token_leaf->values->offsets.getLength() :
token_leaf->values->offset_index.at(doc_index+1);
size_t array_index = 0;
std::vector<uint16_t> positions;
while(start_offset < end_offset) {
uint16_t pos = (uint16_t) token_leaf->values->offsets.at(start_offset);
start_offset++;
if(pos == ARRAY_SEPARATOR) {
if(positions.size() != 0) {
array_token_positions[array_index].push_back(positions);
positions.clear();
}
array_index++;
continue;
}
positions.push_back(pos);
}
if(positions.size() != 0) {
// for plain string fields that don't use an ARRAY_SEPARATOR
array_token_positions[array_index].push_back(positions);
}
}
}
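Continuing the worked example from index_string_array_field: decoding 1, ARRAY_SEPARATOR, ARRAY_SEPARATOR, 0, ARRAY_SEPARATOR for the token "truth" first counts three separators (array_size = 3), then yields array_token_positions[0] = {{1}}, array_token_positions[1] = {} (no occurrence), and array_token_positions[2] = {{0}}, so every array element can be scored by Match::match independently.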
inline std::vector<art_leaf *> Index::next_suggestion(const std::vector<token_candidates> &token_candidates_vec,

View File

@@ -0,0 +1,3 @@
{"title": "The Truth About Forever", "tags": ["the truth", "about forever", "truth about"], "points": 100}
{"title": "Plain Truth", "tags": ["plain", "truth", "plain truth"], "points": 40}
{"title": "Temple of the Winds", "tags": ["temple", "of", "temple of"], "points": 87}

View File

@@ -487,6 +487,82 @@ TEST_F(CollectionTest, PrefixSearching) {
ASSERT_EQ("16", results["hits"].at(0)["document"]["id"]);
}
TEST_F(CollectionTest, ArrayStringFieldHighlight) {
Collection *coll_array_text;
std::ifstream infile(std::string(ROOT_DIR) + "test/array_text_documents.jsonl");
std::vector<field> fields = {
field("title", field_types::STRING, false),
field("tags", field_types::STRING_ARRAY, false),
field("points", field_types::INT32, false)
};
coll_array_text = collectionManager.get_collection("coll_array_text");
if (coll_array_text == nullptr) {
coll_array_text = collectionManager.create_collection("coll_array_text", fields, "points").get();
}
std::string json_line;
while (std::getline(infile, json_line)) {
coll_array_text->add(json_line);
}
infile.close();
query_fields = {"tags"};
std::vector<std::string> facets;
nlohmann::json results = coll_array_text->search("truth about", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
false, 0).get();
ASSERT_EQ(1, results["hits"].size());
std::vector<std::string> ids = {"0"};
for (size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);
std::string result_id = result["document"]["id"];
std::string id = ids.at(i);
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
ASSERT_STREQ(results["hits"][0]["highlight"]["tags"].get<std::string>().c_str(), "<mark>truth</mark> <mark>about</mark>");
results = coll_array_text->search("forever truth", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
false, 0).get();
ASSERT_EQ(1, results["hits"].size());
ids = {"0"};
for (size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);
std::string result_id = result["document"]["id"];
std::string id = ids.at(i);
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
ASSERT_STREQ(results["hits"][0]["highlight"]["tags"].get<std::string>().c_str(), "the <mark>truth</mark>");
results = coll_array_text->search("truth", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
false, 0).get();
ASSERT_EQ(2, results["hits"].size());
ids = {"0", "1"};
for (size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);
std::string result_id = result["document"]["id"];
std::string id = ids.at(i);
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
results = coll_array_text->search("asdadasd", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
false, 0).get();
ASSERT_EQ(0, results["hits"].size());
collectionManager.drop_collection("coll_array_text");
}
TEST_F(CollectionTest, MultipleFields) {
Collection *coll_mul_fields;