diff --git a/TODO.md b/TODO.md
index b4479eda..ba6b6966 100644
--- a/TODO.md
+++ b/TODO.md
@@ -100,6 +100,8 @@
 - NOT operator support
 - Log operations
 - Parameterize replica's MAX_UPDATES_TO_SEND
+- NOT operator support
+- 64K token limit
 - > INT32_MAX validation for float field
 - highlight of string arrays?
 - test for token ranking on float field
diff --git a/include/index.h b/include/index.h
index 79ab0ad2..3346cec9 100644
--- a/include/index.h
+++ b/include/index.h
@@ -84,10 +84,6 @@ private:
     void do_facets(std::vector<facet> & facets, uint32_t* result_ids, size_t results_size);
 
-    void populate_token_positions(const std::vector<art_leaf *> &query_suggestion,
-                                  spp::sparse_hash_map<const art_leaf *, uint32_t *> &leaf_to_indices,
-                                  size_t result_index, std::vector<std::vector<uint16_t>> &token_positions) const;
-
     void search_field(std::string & query, const std::string & field, uint32_t *filter_ids, size_t filter_ids_length,
                       std::vector<facet> & facets, const std::vector<sort_by> & sort_fields, const int num_typos,
                       const size_t num_results,
@@ -102,6 +98,9 @@ private:
                       Topster<512> & topster, size_t & total_results, uint32_t** all_result_ids,
                       size_t & all_result_ids_len, const size_t & max_results, const bool prefix);
 
+    void insert_doc(const uint32_t score, art_tree *t, uint32_t seq_id,
+                    const std::unordered_map<std::string, std::vector<uint32_t>> &token_to_offsets) const;
+
     void index_string_field(const std::string & text, const uint32_t score, art_tree *t, uint32_t seq_id,
                             const bool verbatim) const;
 
@@ -147,6 +146,11 @@ public:
     Option<uint32_t> remove(const uint32_t seq_id, nlohmann::json & document);
 
+    static void populate_token_positions(const std::vector<art_leaf *> &query_suggestion,
+                                         spp::sparse_hash_map<const art_leaf *, uint32_t *> &leaf_to_indices,
+                                         size_t result_index,
+                                         std::vector<std::vector<std::vector<uint16_t>>> &array_token_positions);
+
     void score_results(const std::vector<sort_by> & sort_fields, const int & query_index, const uint32_t total_cost,
                        Topster<512> &topster, const std::vector<art_leaf *> & query_suggestion,
                        const uint32_t *result_ids, const size_t result_size) const;
@@ -162,6 +166,8 @@ public:
     // strings under this length will be fully highlighted, instead of showing a snippet of relevant portion
     enum {SNIPPET_STR_ABOVE_LEN = 30};
 
+    enum {ARRAY_SEPARATOR = UINT16_MAX};
+
     // Using a $ prefix so that these meta keys stay above record entries in a lexicographically ordered KV store
     static constexpr const char* COLLECTION_META_PREFIX = "$CM";
     static constexpr const char* COLLECTION_NEXT_SEQ_PREFIX = "$CS";
diff --git a/include/match_score.h b/include/match_score.h
index cf901000..29e85329 100644
--- a/include/match_score.h
+++ b/include/match_score.h
@@ -35,7 +35,7 @@ struct Match {
     uint16_t start_offset;
     char offset_diffs[16];
 
-    Match() {
+    Match(): words_present(0), distance(0), start_offset(0) {
 
     }
 
@@ -44,6 +44,14 @@ struct Match {
         memcpy(offset_diffs, offset_diffs_stacked, 16);
     }
 
+    // Construct a single match score from individual components (for multi-field sort)
+    inline uint64_t get_match_score(const uint32_t total_cost) const {
+        uint64_t match_score = ((int64_t)(words_present) << 24) |
+                               ((int64_t)(255 - total_cost) << 16) |
+                               ((int64_t)(distance));
+        return match_score;
+    }
+
     static void print_token_offsets(std::vector<std::vector<uint16_t>> &token_offsets) {
         for(auto offsets: token_offsets) {
             for(auto offset: offsets) {
@@ -54,7 +62,8 @@ struct Match {
 
     static inline void addTopOfHeapToWindow(TokenOffsetHeap &heap, std::queue<TokenOffset> &window,
-                                            std::vector<std::vector<uint16_t>> &token_offsets, uint16_t *token_offset) {
+                                            const std::vector<std::vector<uint16_t>> &token_offsets,
+                                            uint16_t *token_offset) {
         TokenOffset top = heap.top();
         heap.pop();
         window.push(top);
@@ -90,7 +99,7 @@ struct Match {
      * We use a priority queue to read the offset vectors in a sorted manner, slide a window of a given size, and
      * compute the max_match and min_displacement of target tokens across the windows.
      */
-    static Match match(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
+    static Match match(uint32_t doc_id, const std::vector<std::vector<uint16_t>> &token_offsets) {
         std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset> heap;
 
         for(uint8_t token_id=0; token_id < token_offsets.size(); token_id++) {
diff --git a/src/collection.cpp b/src/collection.cpp
index 5d87140d..f8630eac 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -545,14 +545,9 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<
-        if(search_field.type == field_types::STRING) {
-            std::vector<std::string> tokens;
-            StringUtils::split(document[field_name], tokens, " ");
-
-            // positions in the document of each token in the query
-            std::vector<std::vector<uint16_t>> token_positions;
+        if(search_field.type == field_types::STRING || search_field.type == field_types::STRING_ARRAY) {
+            spp::sparse_hash_map<const art_leaf*, uint32_t*> leaf_to_indices;
             for (const art_leaf *token_leaf : searched_queries[field_order_kv.second.query_index]) {
                 std::vector<uint16_t> positions;
                 uint32_t doc_index = token_leaf->values->ids.indexOf(field_order_kv.second.key);
@@ -560,20 +555,42 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<
-                uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
-                uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
-                                      token_leaf->values->offsets.getLength() :
-                                      token_leaf->values->offset_index.at(doc_index+1);
-
-                while(start_offset < end_offset) {
-                    positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset));
-                    start_offset++;
-                }
-
-                token_positions.push_back(positions);
+                uint32_t *indices = new uint32_t[1];
+                indices[0] = doc_index;
+                leaf_to_indices.emplace(token_leaf, indices);
             }
 
-            Match match = Match::match(field_order_kv.second.key, token_positions);
+            // positions in the field of each token in the query
+            std::vector<std::vector<std::vector<uint16_t>>> array_token_positions;
+            Index::populate_token_positions(searched_queries[field_order_kv.second.query_index],
+                                            leaf_to_indices, 0, array_token_positions);
+
+            Match match;
+            uint64_t match_score = 0;
+            size_t matched_array_index = 0;
+
+            for(size_t array_index = 0; array_index < array_token_positions.size(); array_index++) {
+                const std::vector<std::vector<uint16_t>> & token_positions = array_token_positions[array_index];
+
+                if(token_positions.empty()) {
+                    continue;
+                }
+
+                const Match & this_match = Match::match(field_order_kv.second.key, token_positions);
+                uint64_t this_match_score = this_match.get_match_score(1);
+                if(this_match_score > match_score) {
+                    match_score = this_match_score;
+                    match = this_match;
+                    matched_array_index = array_index;
+                }
+            }
+
+            std::vector<std::string> tokens;
+            if(search_field.type == field_types::STRING) {
+                StringUtils::split(document[field_name], tokens, " ");
+            } else {
+                StringUtils::split(document[field_name][matched_array_index], tokens, " ");
+            }
 
             // unpack `match.offset_diffs` into `token_indices`
             std::vector<size_t> token_indices;
@@ -609,6 +626,11 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<
+
+            for(auto it = leaf_to_indices.begin(); it != leaf_to_indices.end(); it++) {
+                delete [] it->second;
+                it->second = nullptr;
+            }
         }
 
         result["hits"].push_back(wrapper_doc);
diff --git a/src/index.cpp b/src/index.cpp
index 2a73a6fe..2cb1efc5 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -225,6 +225,7 @@ void Index::index_string_field(const std::string & text, const uint32_t score, a
         token_to_offsets[text].push_back(0);
     } else {
         StringUtils::split(text, tokens, " ");
+
         for(uint32_t i=0; i<tokens.size(); i++) {
@@ -236,6 +237,11 @@ void Index::index_string_field(const std::string & text, const uint32_t score, a
+    insert_doc(score, t, seq_id, token_to_offsets);
+}
+
+void Index::insert_doc(const uint32_t score, art_tree *t, uint32_t seq_id,
+                       const std::unordered_map<std::string, std::vector<uint32_t>> &token_to_offsets) const {
     for(auto & kv: token_to_offsets) {
         art_document art_doc;
         art_doc.id = seq_id;
@@ -263,9 +269,33 @@ void Index::index_string_field(const std::string & text, const uint32_t score, a
 void Index::index_string_array_field(const std::vector<std::string> & strings, const uint32_t score, art_tree *t,
                                      uint32_t seq_id, const bool verbatim) const {
-    for(const std::string & str: strings) {
-        index_string_field(str, score, t, seq_id, verbatim);
+    std::unordered_map<std::string, std::vector<std::vector<uint32_t>>> token_array_positions;
+
+    for(size_t array_index = 0; array_index < strings.size(); array_index++) {
+        const std::string & str = strings[array_index];
+
+        std::vector<std::string> tokens;
+        StringUtils::split(str, tokens, " ");
+
+        for(uint32_t i=0; i<tokens.size(); i++) {
+            token_array_positions[tokens[i]].resize(strings.size());
+            token_array_positions[tokens[i]][array_index].push_back(i);
+        }
     }
+
+    std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;
+
+    for(const auto & kv: token_array_positions) {
+        for(size_t array_index = 0; array_index < strings.size(); array_index++) {
+            token_to_offsets[kv.first].insert(token_to_offsets[kv.first].end(),
+                                              token_array_positions[kv.first][array_index].begin(),
+                                              token_array_positions[kv.first][array_index].end());
+            token_to_offsets[kv.first].push_back(ARRAY_SEPARATOR);
+        }
+    }
+
+    insert_doc(score, t, seq_id, token_to_offsets);
 }
 
 void Index::index_int32_array_field(const std::vector<int32_t> & values, const uint32_t score, art_tree *t,
@@ -850,9 +880,7 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const int &
     char empty_offset_diffs[16];
     std::fill_n(empty_offset_diffs, 16, 0);
     Match single_token_match = Match(1, 0, 0, empty_offset_diffs);
-    const uint64_t single_token_match_score = ((int64_t)(single_token_match.words_present) << 24) |
-                                              ((int64_t)(255 - total_cost) << 16) |
-                                              ((int64_t)(single_token_match.distance));
+    const uint64_t single_token_match_score = single_token_match.get_match_score(total_cost);
 
     for(size_t i=0; i<result_size; i++) {
@@ -861,15 +891,29 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const int &
         if(query_suggestion.size() == 1) {
             match_score = single_token_match_score;
         } else {
-            std::vector<std::vector<uint16_t>> token_positions;
-            populate_token_positions(query_suggestion, leaf_to_indices, i, token_positions);
-            const Match & match = Match::match(seq_id, token_positions);
+            std::vector<std::vector<std::vector<uint16_t>>> array_token_positions;
+            populate_token_positions(query_suggestion, leaf_to_indices, i, array_token_positions);
 
-            // Construct a single match score from individual components (for multi-field sort)
-            match_score = ((int64_t)(match.words_present) << 24) |
-                          ((int64_t)(255 - total_cost) << 16) |
-                          ((int64_t)(match.distance));
+            for(const std::vector<std::vector<uint16_t>> & token_positions: array_token_positions) {
+                if(token_positions.size() == 0) {
+                    continue;
+                }
+                const Match & match = Match::match(seq_id, token_positions);
+                uint64_t this_match_score = match.get_match_score(total_cost);
+
+                if(this_match_score > match_score) {
+                    match_score = this_match_score;
+                }
+
+                /*std::ostringstream os;
+                os << name << ", total_cost: " << (255 - total_cost)
+                   << ", words_present: " << match.words_present
+                   << ", match_score: " << match_score
+                   << ", match.distance: " << match.distance
+                   << ", seq_id: " << seq_id << std::endl;
+                std::cout << os.str();*/
+            }
         }
 
         const int64_t default_score = 0;
@@ -889,15 +931,6 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const int &
             const number_t & primary_rank_value = primary_rank_score * primary_rank_factor;
             const number_t & secondary_rank_value = secondary_rank_score * secondary_rank_factor;
             topster.add(seq_id, query_index, match_score, primary_rank_value, secondary_rank_value);
-
-            /*
-            std::ostringstream os;
-            os << name << ", total_cost: " << (255 - total_cost)
-               << ", words_present: " << match.words_present << ", match_score: " << match_score
-               << ", match.distance: " << match.distance
-               << ", seq_id: " << seq_id << std::endl;
-            LOG(INFO) << os.str();
-            */
         }
 
         //long long int timeNanos = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now() - begin).count();
@@ -910,28 +943,82 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const int &
 }
 
 void Index::populate_token_positions(const std::vector<art_leaf *> &query_suggestion,
-                                     spp::sparse_hash_map<const art_leaf *, uint32_t *> &leaf_to_indices,
-                                     size_t result_index, std::vector<std::vector<uint16_t>> &token_positions) const {
-    // for each token in the query, find the positions that it appears in this document
+                                     spp::sparse_hash_map<const art_leaf *, uint32_t *> &leaf_to_indices,
+                                     size_t result_index,
+                                     std::vector<std::vector<std::vector<uint16_t>>> &array_token_positions) {
+
+    // array_token_positions:
+    // for every element in a potential array, for every token in query suggestion, get the positions
+
+    // first let's ascertain the size of the array
+    size_t array_size = 0;
+
     for (const art_leaf *token_leaf : query_suggestion) {
-        std::vector<uint16_t> positions;
-        uint32_t doc_index = leaf_to_indices.at(token_leaf)[result_index];
-        if(doc_index == token_leaf->values->ids.getLength()) {
+        uint32_t doc_index = leaf_to_indices.at(token_leaf)[result_index];
+        if(doc_index == token_leaf->values->ids.getLength()) {
+            continue;
+        }
+
+        uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
+        uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
+                              token_leaf->values->offsets.getLength() :
+                              token_leaf->values->offset_index.at(doc_index+1);
+
+        while(start_offset < end_offset) {
+            uint16_t pos = (uint16_t) token_leaf->values->offsets.at(start_offset);
+            if(pos == ARRAY_SEPARATOR) {
+                array_size++;
+            }
+            start_offset++;
+        }
+
+        if(array_size == 0) {
+            // for plain string fields that don't use an ARRAY_SEPARATOR
+            array_size = 1;
+        }
+
+        break;
+    }
+
+    // initialize array_token_positions
+    array_token_positions = std::vector<std::vector<std::vector<uint16_t>>>(array_size);
+
+    // for each token in the query, find the positions that it appears in the array
+    for (const art_leaf *token_leaf : query_suggestion) {
+        uint32_t doc_index = leaf_to_indices.at(token_leaf)[result_index];
+        if(doc_index == token_leaf->values->ids.getLength()) {
+            continue;
+        }
+
+        uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
+        uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
+                              token_leaf->values->offsets.getLength() :
+                              token_leaf->values->offset_index.at(doc_index+1);
+
+        size_t array_index = 0;
+        std::vector<uint16_t> positions;
+
+        while(start_offset < end_offset) {
+            uint16_t pos = (uint16_t) token_leaf->values->offsets.at(start_offset);
+            start_offset++;
+
+            if(pos == ARRAY_SEPARATOR) {
+                if(positions.size() != 0) {
+                    array_token_positions[array_index].push_back(positions);
+                    positions.clear();
+                }
+                array_index++;
                 continue;
             }
 
-            uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
-            uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
-                                  token_leaf->values->offsets.getLength() :
-                                  token_leaf->values->offset_index.at(doc_index+1);
-
-            while(start_offset < end_offset) {
-                positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset));
-                start_offset++;
-            }
-
-            token_positions.push_back(positions);
+            positions.push_back(pos);
         }
+
+        if(positions.size() != 0) {
+            // for plain string fields that don't use an ARRAY_SEPARATOR
+            array_token_positions[array_index].push_back(positions);
+        }
     }
 }
 
 inline std::vector<art_leaf *> Index::next_suggestion(const std::vector<token_candidates> &token_candidates_vec,
diff --git a/test/array_text_documents.jsonl b/test/array_text_documents.jsonl
new file mode 100644
index 00000000..8cd27b6a
--- /dev/null
+++ b/test/array_text_documents.jsonl
@@ -0,0 +1,3 @@
+{"title": "The Truth About Forever", "tags": ["the truth", "about forever", "truth about"], "points": 100}
+{"title": "Plain Truth", "tags": ["plain", "truth", "plain truth"], "points": 40}
+{"title": "Temple of the Winds", "tags": ["temple", "of", "temple of"], "points": 87}
\ No newline at end of file
diff --git a/test/collection_test.cpp b/test/collection_test.cpp
index c503916d..219a4c83 100644
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@@ -487,6 +487,82 @@ TEST_F(CollectionTest, PrefixSearching) {
     ASSERT_EQ("16", results["hits"].at(0)["document"]["id"]);
 }
 
+TEST_F(CollectionTest, ArrayStringFieldHighlight) {
+    Collection *coll_array_text;
+
+    std::ifstream infile(std::string(ROOT_DIR) + "test/array_text_documents.jsonl");
+    std::vector<field> fields = {
+        field("title", field_types::STRING, false),
+        field("tags", field_types::STRING_ARRAY, false),
+        field("points", field_types::INT32, false)
+    };
+
+    coll_array_text = collectionManager.get_collection("coll_array_text");
+    if (coll_array_text == nullptr) {
+        coll_array_text = collectionManager.create_collection("coll_array_text", fields, "points").get();
+    }
+
+    std::string json_line;
+
+    while (std::getline(infile, json_line)) {
+        coll_array_text->add(json_line);
+    }
+
+    infile.close();
+
+    query_fields = {"tags"};
+    std::vector<std::string> facets;
+
+    nlohmann::json results = coll_array_text->search("truth about", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
+                                                     false, 0).get();
+    ASSERT_EQ(1, results["hits"].size());
+
+    std::vector<std::string> ids = {"0"};
+
+    for (size_t i = 0; i < results["hits"].size(); i++) {
+        nlohmann::json result = results["hits"].at(i);
+        std::string result_id = result["document"]["id"];
+        std::string id = ids.at(i);
+        ASSERT_STREQ(id.c_str(), result_id.c_str());
+    }
+
+    ASSERT_STREQ(results["hits"][0]["highlight"]["tags"].get<std::string>().c_str(), "<mark>truth</mark> <mark>about</mark>");
+
+    results = coll_array_text->search("forever truth", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
+                                      false, 0).get();
+    ASSERT_EQ(1, results["hits"].size());
+
+    ids = {"0"};
+
+    for (size_t i = 0; i < results["hits"].size(); i++) {
+        nlohmann::json result = results["hits"].at(i);
+        std::string result_id = result["document"]["id"];
+        std::string id = ids.at(i);
+        ASSERT_STREQ(id.c_str(), result_id.c_str());
+    }
+
+    ASSERT_STREQ(results["hits"][0]["highlight"]["tags"].get<std::string>().c_str(), "the <mark>truth</mark>");
+
+    results = coll_array_text->search("truth", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
+                                      false, 0).get();
+    ASSERT_EQ(2, results["hits"].size());
+
+    ids = {"0", "1"};
+
+    for (size_t i = 0; i < results["hits"].size(); i++) {
+        nlohmann::json result = results["hits"].at(i);
+        std::string result_id = result["document"]["id"];
+        std::string id = ids.at(i);
+        ASSERT_STREQ(id.c_str(), result_id.c_str());
+    }
+
+    results = coll_array_text->search("asdadasd", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
+                                      false, 0).get();
+    ASSERT_EQ(0, results["hits"].size());
+
+    collectionManager.drop_collection("coll_array_text");
+}
+
 TEST_F(CollectionTest, MultipleFields) {
     Collection *coll_mul_fields;
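
A note on the ranking key introduced by Match::get_match_score above: words_present sits in the most significant bits, so it dominates the comparison, followed by typo cost (255 - total_cost) and then distance. A minimal standalone sketch of the same packing (pack_match_score is an illustrative free function, not part of the codebase):

```cpp
#include <cassert>
#include <cstdint>

// Mirrors the packing in Match::get_match_score: the more significant the
// bit range, the higher that component's priority when keys are compared.
uint64_t pack_match_score(uint8_t words_present, uint32_t total_cost, uint8_t distance) {
    return ((uint64_t) words_present << 24) |
           ((uint64_t) (255 - total_cost) << 16) |
           ((uint64_t) distance);
}

int main() {
    // A hit containing more of the query's words outranks one with fewer
    // words, regardless of its typo cost or distance component.
    assert(pack_match_score(2, 2, 0) > pack_match_score(1, 0, 255));
    // With equal words_present, the cheaper (lower typo cost) match wins.
    assert(pack_match_score(2, 0, 0) > pack_match_score(2, 1, 255));
    return 0;
}
```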
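
The comment on Match::match ("read the offset vectors in a sorted manner, slide a window of a given size, and compute the max_match and min_displacement") can be illustrated without the heap machinery. A simplified sketch, assuming an arbitrary window width of 5; the real implementation streams offsets through a priority queue rather than sorting a merged list upfront:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <set>
#include <vector>

struct Hit { uint8_t token_id; uint16_t offset; };

int main() {
    const int WINDOW_SIZE = 5;  // assumed width, for illustration only

    // token 0 appears at offsets {2, 30}, token 1 at offsets {3, 9}
    std::vector<Hit> hits = {{0, 2}, {1, 3}, {1, 9}, {0, 30}};

    // merge all (token_id, offset) pairs in offset order
    std::sort(hits.begin(), hits.end(),
              [](const Hit & a, const Hit & b) { return a.offset < b.offset; });

    size_t best_present = 0;            // max distinct query tokens in any window
    uint16_t best_spread = UINT16_MAX;  // tightest such window

    for(size_t start = 0; start < hits.size(); start++) {
        std::set<uint8_t> tokens_in_window;
        uint16_t spread = 0;
        for(size_t i = start; i < hits.size() &&
                              hits[i].offset - hits[start].offset < WINDOW_SIZE; i++) {
            tokens_in_window.insert(hits[i].token_id);
            spread = hits[i].offset - hits[start].offset;
        }
        if(tokens_in_window.size() > best_present ||
           (tokens_in_window.size() == best_present && spread < best_spread)) {
            best_present = tokens_in_window.size();
            best_spread = spread;
        }
    }

    // prints "words_present: 2, spread: 1" -- offsets 2 and 3 co-occur
    std::cout << "words_present: " << best_present
              << ", spread: " << best_spread << std::endl;
    return 0;
}
```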
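
Finally, the array-field scheme that ties the indexing and searching sides together: index_string_array_field flattens each token's per-element offsets into a single list, appending ARRAY_SEPARATOR (UINT16_MAX) after every element, and populate_token_positions later splits that flat list back into per-element position vectors. A self-contained sketch of the round trip (flatten and split are hypothetical helper names; the real split loop also skips elements in which the token never appears):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

const uint16_t ARRAY_SEPARATOR = UINT16_MAX;  // same sentinel as in index.h

// Flatten one token's offsets element by element, terminating each element's
// run with ARRAY_SEPARATOR (the shape index_string_array_field builds).
std::vector<uint16_t> flatten(const std::vector<std::vector<uint16_t>> & per_element) {
    std::vector<uint16_t> flat;
    for(const auto & offsets: per_element) {
        flat.insert(flat.end(), offsets.begin(), offsets.end());
        flat.push_back(ARRAY_SEPARATOR);
    }
    return flat;
}

// Split the flat list back into per-element positions (the core of the new
// populate_token_positions loop). Empty runs are kept here so the round trip
// is exact; the real code skips them and just advances array_index.
std::vector<std::vector<uint16_t>> split(const std::vector<uint16_t> & flat) {
    std::vector<std::vector<uint16_t>> per_element;
    std::vector<uint16_t> positions;
    for(uint16_t pos: flat) {
        if(pos == ARRAY_SEPARATOR) {
            per_element.push_back(positions);
            positions.clear();
        } else {
            positions.push_back(pos);
        }
    }
    return per_element;
}

int main() {
    // e.g. a token at position 1 of element 0, absent from element 1, and at
    // position 0 of element 2 (cf. the "tags" fields in array_text_documents.jsonl)
    std::vector<std::vector<uint16_t>> original = {{1}, {}, {0}};
    std::cout << (split(flatten(original)) == original ? "round trip ok" : "mismatch")
              << std::endl;
    return 0;
}
```

Reserving UINT16_MAX as the sentinel means it can never be a real token offset, which appears to be the "64K token limit" item added to TODO.md above.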