Merge branch '0.22.0-rc' into postings-refactor-integration

# Conflicts:
#	include/index.h
#	include/posting.h
#	include/posting_list.h
#	src/art.cpp
#	src/collection.cpp
#	src/index.cpp
#	src/posting.cpp
#	src/posting_list.cpp
#	test/art_test.cpp
#	test/collection_specific_test.cpp
#	test/collection_test.cpp
#	test/posting_list_test.cpp
Kishore Nallan 2021-07-24 17:10:54 +05:30
commit 41c16fb7a7
9 changed files with 279 additions and 38 deletions

README.md

@ -5,6 +5,11 @@
Typesense is a fast, typo-tolerant search engine for building delightful search experiences.
</p>
<p align="center">
An Open Source Algolia Alternative & <br>
An Easier-to-Use ElasticSearch Alternative
</p>
<p align="center">
<a href="https://circleci.com/gh/typesense/typesense"><img src="https://circleci.com/gh/typesense/typesense.svg?style=shield&circle-token=1addd775339738a3d90869ddd8201110d561feaa"></a>
<a href="https://hub.docker.com/r/typesense/typesense/tags"><img src="https://img.shields.io/docker/pulls/typesense/typesense"></a>
@ -25,7 +30,9 @@
- Search a 32M songs dataset from MusicBrainz: [songs-search.typesense.org](https://songs-search.typesense.org/)
- Search a 28M books dataset from OpenLibrary: [books-search.typesense.org](https://books-search.typesense.org/)
- Search a 2M recipe dataset from RecipeNLG: [recipe-search.typesense.org](https://recipe-search.typesense.org/)
- Search 1M Git commit messages from the Linux Kernel: [linux-commits-search.typesense.org](https://linux-commits-search.typesense.org/)
- Spellchecker with type-ahead, with 333K English words: [spellcheck.typesense.org](https://spellcheck.typesense.org/)
- An E-Commerce Store Browsing experience: [ecommerce-store.typesense.org](https://ecommerce-store.typesense.org/)
🗣️ 🎥 If you prefer watching videos, here's one where we introduce Typesense and show a walk-through: https://youtu.be/F4mB0x_B1AE?t=144
@ -56,11 +63,13 @@
- **Faceting & Filtering:** Drill down and refine results.
- **Grouping & Distinct:** Group similar results together to show more variety.
- **Federated Search:** Search across multiple collections (indices) in a single HTTP request.
- **Geo Search:** Search and sort by results around a geographic location - [in beta](https://github.com/typesense/typesense/issues/78#issuecomment-842308057).
- **Scoped API Keys:** Generate API keys that only allow access to certain records, for multi-tenant applications.
- **Synonyms:** Define words as equivalents of each other, so searching for a word will also return results for the synonyms defined.
- **Curation & Merchandizing:** Boost particular records to a fixed position in the search results, to feature them.
- **Raft-based Clustering:** Setup a distributed cluster that is highly available.
- **Seamless Version Upgrades:** As new versions of Typesense come out, upgrading is as simple as swapping out the binary and restarting Typesense.
- **No Runtime Dependencies:** Typesense is a single binary that you can run locally or in production with a single command.
**Don't see a feature on this list?** Search our issue tracker if someone has already requested it and upvote it, or open a new issue if not. We prioritize our roadmap based on user feedback, so we'd love to hear from you.
@ -102,7 +111,7 @@ Here's a quick example showcasing how you can create a collection, index a docum
Let's begin by starting the Typesense server via Docker:
```
docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.20.0 --data-dir /data --api-key=Hu52dwsas2AdxdE
docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.21.0 --data-dir /data --api-key=Hu52dwsas2AdxdE
```
We have [API Clients](#api-clients) in a couple of languages, but let's use the Python client for this example.
@ -204,7 +213,7 @@ We welcome community contributions to add more official client libraries and int
You can use our [InstantSearch.js adapter](https://github.com/typesense/typesense-instantsearch-adapter)
to quickly build powerful search experiences, complete with filtering, sorting, pagination and more.
Here's how: [https://typesense.org/docs/0.20.0/guide/#search-ui](https://typesense.org/docs/0.20.0/guide/#search-ui)
Here's how: [https://typesense.org/docs/0.21.0/guide/#search-ui](https://typesense.org/docs/0.21.0/guide/#search-ui)
## FAQ

SECURITY.md (new file)

@ -0,0 +1 @@
Please report any security vulnerabilities to security@typesense.org.

SHOWCASE.md

@ -7,9 +7,7 @@ Please add to this file and send us a PR:
| Name | Description |
| ----------- | ----------- |
| [Recipe Search](https://recipe-search.typesense.org/) | A site that showcases Typesense in action on a 2 Million recipe database, with the ability to filter by ingredients.|
| [Books Search](https://books-search.typesense.org/) | A site that showcases Typesense in action on a 28 Million books database from [OpenLibrary](https://openlibrary.org/), with the ability to filter by authors and subject. |
| [MusicBrainz Songs Search](https://songs-search.typesense.org/) | A site that showcases Typesense in action on a 32 Million Songs database from [MusicBrainz](https://musicbrainz.org/) |
| [2020 US Presidential Candidates' Speeches Search](https://biden-trump-speeches-search.typesense.org/) | Instant Search speeches of US Presidential Candidates side-by-side. |
| [Linux Commit History Search](https://linux-commits-search.typesense.org/) | A site that indexes 1M linux commit messages in Typesense and lets you browse, search and filter through the commits.|
| [Grafikart](https://www.grafikart.fr/) | Learning resources library |
| [New York University Databrary](https://nyu.databrary.org/) | Video documentation library |
| [ElbiseBul](https://www.elbisebul.com/) | E-commerce |
@ -20,3 +18,8 @@ Please add to this file and send us a PR:
| [Have A Class](https://haveaclass.com/) | Find the perfect teacher online |
| [Follow up Boss](https://www.followupboss.com/) | Real-estate CRM software, using Typesense for user notes search. |
| [Jobsort](https://www.jobsort.com/) | Job search engine for developers, by developers (quicksort for tech jobs.) |
| [Books Search](https://books-search.typesense.org/) | A site that showcases Typesense in action on a 28 Million books database from [OpenLibrary](https://openlibrary.org/), with the ability to filter by authors and subject. |
| [MusicBrainz Songs Search](https://songs-search.typesense.org/) | A site that showcases Typesense in action on a 32 Million Songs database from [MusicBrainz](https://musicbrainz.org/) |
| [2020 US Presidential Candidates' Speeches Search](https://biden-trump-speeches-search.typesense.org/) | Instant Search speeches of US Presidential Candidates side-by-side. |
| [E-commerce Store Browsing Experience](https://ecommerce-store.typesense.org/) | A site that showcases how to build an e-commerce storefront browsing experience with Typesense. |
| [Read This Twice](https://www.readthistwice.com/) | Book discovery platform uses typesense to power the book/people search |

include/index.h

@ -36,6 +36,7 @@ struct token_candidates {
struct search_field_t {
std::string name;
size_t priority;
size_t weight;
};

src/collection.cpp

@ -601,21 +601,22 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
// process weights for search fields
std::vector<search_field_t> weighted_search_fields;
size_t max_weight = 100;
if(query_by_weights.empty()) {
max_weight = search_fields.size();
for(size_t i=1; i <= search_fields.size(); i++) {
query_by_weights.push_back(i);
query_by_weights.push_back((max_weight - i) + 1);
}
} else {
auto max_weight = *std::max_element(query_by_weights.begin(), query_by_weights.end());
for(size_t i=0; i < query_by_weights.size(); i++) {
query_by_weights[i] = (max_weight - query_by_weights[i]) + 1;
}
max_weight = *std::max_element(query_by_weights.begin(), query_by_weights.end());
}
for(size_t i=0; i < search_fields.size(); i++) {
const auto& search_field = search_fields[i];
weighted_search_fields.push_back({search_field, query_by_weights[i]});
const auto priority = (max_weight - query_by_weights[i]) + 1;
const auto weight = query_by_weights[i] + 1;
weighted_search_fields.push_back({search_field, priority, weight});
}
std::vector<facet> facets;
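To make the new weighting scheme concrete, here is a minimal standalone sketch (not the repository code) of how per-field `priority` and `weight` fall out of `query_by_weights` in the hunk above: `priority` is the user weight inverted against the largest weight plus one, so the most heavily weighted field gets priority 1, while `weight` is the user weight plus one. The field names and weights below are illustrative.

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

struct weighted_field {
    std::string name;
    size_t priority;  // smallest for the most heavily weighted field
    size_t weight;    // largest for the most heavily weighted field
};

int main() {
    // hypothetical query_by fields and user-supplied weights
    std::vector<std::string> search_fields = {"title", "description", "attrs"};
    std::vector<size_t> query_by_weights = {10, 4, 1};

    const size_t max_weight = *std::max_element(query_by_weights.begin(), query_by_weights.end());

    std::vector<weighted_field> weighted_search_fields;
    for(size_t i = 0; i < search_fields.size(); i++) {
        const size_t priority = (max_weight - query_by_weights[i]) + 1;  // title -> 1, attrs -> 10
        const size_t weight = query_by_weights[i] + 1;                   // title -> 11, attrs -> 2
        weighted_search_fields.push_back({search_fields[i], priority, weight});
    }

    for(const auto& f : weighted_search_fields) {
        std::cout << f.name << ": priority=" << f.priority << ", weight=" << f.weight << "\n";
    }
}
```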
@ -1699,7 +1700,9 @@ void Collection::highlight_result(const field &search_field,
if(offset_it != token_offsets.end()) {
if (i == offset_it->first) {
value_stream << highlight_start_tag;
} else if (i == offset_it->second) {
}
if (i == offset_it->second) {
value_stream << text[i];
value_stream << highlight_end_tag;
offset_it++;
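The highlighting change above turns an `else if` into an independent `if`, so a matched token that is a single character, where the start and end offsets are the same index, emits both the opening and the closing tag; the `SingleCharMatchFullFieldHighlight` test later in this diff exercises exactly this case. Below is a minimal, self-contained sketch of that loop, assuming a simple map of start offset to inclusive end offset; it is not the repository's `highlight_result`, which also handles snippets and array fields.

```cpp
#include <cstddef>
#include <iostream>
#include <map>
#include <sstream>
#include <string>

// token_offsets maps the start index of a matched token to its (inclusive) end index.
// With two independent `if` checks, a single-character token (start == end) receives
// both the opening and the closing tag in the same iteration.
std::string highlight(const std::string& text, const std::map<size_t, size_t>& token_offsets,
                      const std::string& start_tag, const std::string& end_tag) {
    std::ostringstream value_stream;
    auto offset_it = token_offsets.begin();

    for(size_t i = 0; i < text.size(); i++) {
        if(offset_it != token_offsets.end()) {
            if(i == offset_it->first) {
                value_stream << start_tag;
            }
            if(i == offset_it->second) {  // not `else if`: start and end may coincide
                value_stream << text[i];
                value_stream << end_tag;
                offset_it++;
                continue;
            }
        }
        value_stream << text[i];
    }

    return value_stream.str();
}

int main() {
    // "a" is a single-character match at index 26
    std::cout << highlight("Which of the following is a probable sign of infection?",
                           {{26, 26}}, "<mark>", "</mark>") << "\n";
    // prints: Which of the following is <mark>a</mark> probable sign of infection?
}
```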

src/index.cpp

@ -908,6 +908,7 @@ void Index::search_candidates(const uint8_t & field_id,
uint32_t total_cost = next_suggestion(token_candidates_vec, n, actual_query_suggestion,
query_suggestion, token_bits);
//LOG(INFO) << "field_num_results: " << field_num_results << ", typo_tokens_threshold: " << typo_tokens_threshold;
/*LOG(INFO) << "n: " << n;
for(size_t i=0; i < actual_query_suggestion.size(); i++) {
LOG(INFO) << "i: " << i << " - " << actual_query_suggestion[i]->key << ", ids: "
@ -1026,11 +1027,6 @@ void Index::search_candidates(const uint8_t & field_id,
delete [] excluded_result_ids;
searched_queries.push_back(actual_query_suggestion);
//LOG(INFO) << "field_num_results: " << field_num_results << ", typo_tokens_threshold: " << typo_tokens_threshold;
if(field_num_results >= typo_tokens_threshold) {
break;
}
}
}
@ -1667,15 +1663,21 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,
uint64_t verbatim_match_fields = 0; // query matching field verbatim
uint64_t exact_match_fields = 0; // number of fields that contains all of query tokens
uint64_t max_weighted_tokens_match = 0; // weighted max number of tokens matched in a field
uint64_t total_token_matches = 0; // total matches across fields (including fuzzy ones)
//LOG(INFO) << "Init pop count: " << __builtin_popcount(token_bits);
for(size_t i = 0; i < num_search_fields; i++) {
const auto field_id = (uint8_t)(FIELD_LIMIT_NUM - i);
size_t weight = search_fields[i].weight;
const size_t priority = search_fields[i].priority;
const size_t weight = search_fields[i].weight;
//LOG(INFO) << "--- field index: " << i << ", weight: " << weight;
//LOG(INFO) << "--- field index: " << i << ", priority: " << priority;
// using `5` here because typo + prefix combo score range is: 0 - 5
// 0 1 2
// 0,1 2,3 4,5
int64_t MAX_SUM_TYPOS = 5 * field_query_tokens[i].q_include_tokens.size();
if(existing_field_kvs.count(field_id) != 0) {
// for existing field, we will simply sum field-wise weighted scores
@ -1686,14 +1688,22 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,
uint64_t tokens_found = ((match_score >> 24) & 0xFF);
uint64_t field_typos = 255 - ((match_score >> 16) & 0xFF);
total_typos += (field_typos + 1) * weight;
total_distances += ((100 - ((match_score >> 8) & 0xFF)) + 1) * weight;
total_typos += (field_typos + 1) * priority;
total_distances += ((100 - ((match_score >> 8) & 0xFF)) + 1) * priority;
verbatim_match_fields += (((match_score & 0xFF)) + 1);
if(field_typos == 0 && tokens_found == field_query_tokens[i].q_include_tokens.size()) {
uint64_t unique_tokens_found =
int64_t(__builtin_popcount(existing_field_kvs[field_id]->token_bits)) - 1;
if(field_typos == 0 && unique_tokens_found == field_query_tokens[i].q_include_tokens.size()) {
exact_match_fields++;
}
auto weighted_tokens_match = (tokens_found * weight) + (MAX_SUM_TYPOS - field_typos + 1);
if(weighted_tokens_match > max_weighted_tokens_match) {
max_weighted_tokens_match = weighted_tokens_match;
}
if(field_typos < min_typos) {
min_typos = field_typos;
}
@ -1701,9 +1711,9 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,
total_token_matches += tokens_found;
/*LOG(INFO) << "seq_id: " << seq_id << ", total_typos: " << (255 - ((match_score >> 8) & 0xFF))
<< ", weighted typos: " << std::max<uint64_t>((255 - ((match_score >> 8) & 0xFF)), 1) * weight
<< ", weighted typos: " << std::max<uint64_t>((255 - ((match_score >> 8) & 0xFF)), 1) * priority
<< ", total dist: " << (((match_score & 0xFF)))
<< ", weighted dist: " << std::max<uint64_t>((100 - (match_score & 0xFF)), 1) * weight;*/
<< ", weighted dist: " << std::max<uint64_t>((100 - (match_score & 0xFF)), 1) * priority;*/
continue;
}
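As a worked illustration of the field loop above, the sketch below takes the already-unpacked quantities (`tokens_found`, `field_typos`) and applies `priority` and `weight` the way the hunk does: `priority` scales the typo and distance penalties, while `weight` scales the matched-token credit inside `weighted_tokens_match`. In the hunk, those quantities come out of `match_score` (tokens found in bits 24-31, 255 minus typos in bits 16-23, 100 minus distance in bits 8-15); the concrete numbers here are hypothetical.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
    const uint64_t num_query_tokens = 3;
    const int64_t  MAX_SUM_TYPOS = 5 * num_query_tokens;  // typo + prefix score range is 0-5 per token

    // field A: all 3 tokens found, 0 typos, user weight 10 -> priority 1,  weight 11
    // field B: all 3 tokens found, 1 typo,  user weight 1  -> priority 10, weight 2
    struct { uint64_t tokens_found, field_typos, priority, weight; } fields[] = {
        {3, 0, 1, 11},
        {3, 1, 10, 2},
    };

    uint64_t total_typos = 0, max_weighted_tokens_match = 0;
    for(const auto& f : fields) {
        total_typos += (f.field_typos + 1) * f.priority;  // priority scales the typo penalty
        const uint64_t weighted_tokens_match =
            (f.tokens_found * f.weight) + (MAX_SUM_TYPOS - f.field_typos + 1);  // weight scales token credit
        max_weighted_tokens_match = std::max(max_weighted_tokens_match, weighted_tokens_match);
    }

    std::cout << "total_typos=" << total_typos                                 // (0+1)*1 + (1+1)*10 = 21
              << ", max_weighted_tokens_match=" << max_weighted_tokens_match  // max(49, 21) = 49
              << "\n";
}
```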
@ -1746,14 +1756,20 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,
uint64_t tokens_found = ((match_score >> 24) & 0xFF);
uint64_t field_typos = 255 - ((match_score >> 16) & 0xFF);
total_distances += ((100 - ((match_score >> 8) & 0xFF)) + 1) * weight;
total_typos += (field_typos + 1) * weight;
total_distances += ((100 - ((match_score >> 8) & 0xFF)) + 1) * priority;
total_typos += (field_typos + 1) * priority;
if(field_typos == 0 && tokens_found == field_query_tokens[i].q_include_tokens.size()) {
exact_match_fields++;
verbatim_match_fields++; // this is only an approximate
}
auto weighted_tokens_match = (tokens_found * weight) + (MAX_SUM_TYPOS - field_typos + 1);
if(weighted_tokens_match > max_weighted_tokens_match) {
max_weighted_tokens_match = weighted_tokens_match;
}
if(field_typos < min_typos) {
min_typos = field_typos;
}
@ -1768,9 +1784,11 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,
total_typos = std::min<uint64_t>(255, total_typos);
total_distances = std::min<uint64_t>(100, total_distances);
max_weighted_tokens_match = std::min<uint64_t>(255, max_weighted_tokens_match);
uint64_t aggregated_score = (
(exact_match_fields << 48) | // number of fields that contain *all tokens* in the query
//(exact_match_fields << 48) | // number of fields that contain *all tokens* in the query
(max_weighted_tokens_match << 48) | // weighted max number of tokens matched in a field
(uniq_tokens_found << 40) | // number of unique tokens found across fields including typos
((255 - min_typos) << 32) | // minimum typo cost across all fields
(total_token_matches << 24) | // total matches across fields including typos
@ -1783,6 +1801,7 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,
/*LOG(INFO) << "seq id: " << seq_id
<< ", exact_match_fields: " << exact_match_fields
<< ", max_weighted_tokens_match: " << max_weighted_tokens_match
<< ", uniq_tokens_found: " << uniq_tokens_found
<< ", min typo score: " << (255 - min_typos)
<< ", total_token_matches: " << total_token_matches
@ -1930,6 +1949,12 @@ void Index::search_field(const uint8_t & field_id,
// when no more costs are left for this token
if(token_to_costs[token_index].empty()) {
// we can try to drop the token and search with remaining tokens
if(field_num_results >= drop_tokens_threshold) {
// but if drop_tokens_threshold is breached, we are done
return ;
}
token_to_costs.erase(token_to_costs.begin()+token_index);
search_tokens.erase(search_tokens.begin()+token_index);
query_tokens.erase(query_tokens.begin()+token_index);
@ -1956,8 +1981,8 @@ void Index::search_field(const uint8_t & field_id,
resume_typo_loop:
if(field_num_results >= drop_tokens_threshold || field_num_results >= typo_tokens_threshold) {
// if either threshold is breached, we are done
if(field_num_results >= typo_tokens_threshold) {
// if typo threshold is breached, we are done
return ;
}
@ -1968,6 +1993,11 @@ void Index::search_field(const uint8_t & field_id,
if(!query_tokens.empty() && num_tokens_dropped < query_tokens.size()) {
// Drop tokens from right until (len/2 + 1), and then from left until (len/2 + 1)
if(field_num_results >= drop_tokens_threshold) {
// if drop_tokens_threshold is breached, we are done
return ;
}
std::vector<token_t> truncated_tokens;
num_tokens_dropped++;
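The two hunks above separate the stopping conditions in `Index::search_field`: `typo_tokens_threshold` alone now ends the loop over costlier typo combinations, and `drop_tokens_threshold` is checked immediately before a token is dropped and the search retried. A condensed control-flow sketch under those assumptions, with the actual search work elided:

```cpp
#include <cstddef>
#include <vector>

// typo_tokens_threshold gates trying costlier typo combinations; drop_tokens_threshold
// gates retrying the search with one query token dropped (checked before dropping).
void search_field_sketch(std::vector<int>& search_tokens, size_t field_num_results,
                         size_t typo_tokens_threshold, size_t drop_tokens_threshold) {
    // ... typo-cost loop runs here and accumulates field_num_results ...

    // resume_typo_loop: stop exploring higher typo costs once enough results exist
    if(field_num_results >= typo_tokens_threshold) {
        return;
    }

    // a token ran out of typo costs: drop it and search again only while results
    // are still below the drop threshold
    if(field_num_results >= drop_tokens_threshold) {
        return;
    }

    if(!search_tokens.empty()) {
        search_tokens.pop_back();  // stand-in for erasing the exhausted token
        search_field_sketch(search_tokens, field_num_results,
                            typo_tokens_threshold, drop_tokens_threshold);
    }
}

int main() {
    std::vector<int> tokens = {1, 2, 3};  // placeholder query tokens
    // 10 results already found: the drop threshold (10) is met, so no token is dropped
    search_field_sketch(tokens, /*field_num_results=*/10,
                        /*typo_tokens_threshold=*/100, /*drop_tokens_threshold=*/10);
    return tokens.size() == 3 ? 0 : 1;
}
```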


@ -21,7 +21,7 @@ std::string get_query(StringUtils & string_utils, std::string & text) {
for(uint32_t i=0; i<tokens.size(); i++) {
auto token = tokens[i];
string_utils.unicode_normalize(token);
//string_utils.unicode_normalize(token);
normalized_tokens.push_back(token);
}
@ -87,7 +87,7 @@ void benchmark_hn_titles(char* file_path) {
auto begin = std::chrono::high_resolution_clock::now();
for(size_t i = 0; i < queries.size(); i++) {
auto results_op = collection->search(queries[i], search_fields, "", { }, {sort_by("points", "DESC")}, 2, 10, 1, MAX_SCORE, true);
auto results_op = collection->search(queries[i], search_fields, "", { }, {sort_by("points", "DESC")}, {2}, 10, 1, MAX_SCORE, {true});
if(results_op.ok() != true) {
exit(2);
}
@ -152,8 +152,8 @@ void benchmark_reactjs_pages(char* file_path) {
auto begin = std::chrono::high_resolution_clock::now();
for(size_t i = 0; i < queries.size(); i++) {
auto results_op = collection->search(queries[i], search_fields, "", { }, {sort_by("dummy_sorting_field", "DESC")}, 2, 10, 1,
MAX_SCORE, true, 10, spp::sparse_hash_set<std::string>(), {"p"});
auto results_op = collection->search(queries[i], search_fields, "", { }, {sort_by("dummy_sorting_field", "DESC")}, {2}, 10, 1,
MAX_SCORE, {true}, 10, spp::sparse_hash_set<std::string>(), {"p"});
if(results_op.ok() != true) {
exit(2);
}
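These benchmark call sites change shape because `num_typos` and `prefix` are passed to `Collection::search` as per-field vectors (`{2}`, `{true}`) rather than scalars on this branch, matching the test call sites later in this diff. Below is a small hypothetical helper, not taken from the diff, sketching one obvious broadcasting behavior when a single value is supplied for several fields; whether Typesense broadcasts this way internally is an assumption.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical helper: expand a single per-field setting, e.g. {2}, to one entry per
// query field, so {2} with fields {"title", "description"} behaves like {2, 2}.
template <typename T>
std::vector<T> broadcast_per_field(std::vector<T> values, size_t num_fields) {
    if(values.size() == 1 && num_fields > 1) {
        values.resize(num_fields, values[0]);
    }
    return values;
}

int main() {
    const auto num_typos = broadcast_per_field<uint32_t>({2}, 2);  // -> {2, 2}
    const auto prefixes  = broadcast_per_field<bool>({true}, 2);   // -> {true, true}
    return num_typos.size() + prefixes.size() == 4 ? 0 : 1;
}
```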

test/collection_specific_test.cpp

@ -169,8 +169,6 @@ TEST_F(CollectionSpecificTest, ExactSingleFieldMatch) {
auto results = coll1->search("charger", {"title", "description"}, "", {}, {}, {2}, 10,
1, FREQUENCY, {true, true}).get();
LOG(INFO) << results;
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("1", results["hits"][1]["document"]["id"].get<std::string>());
@ -200,11 +198,34 @@ TEST_F(CollectionSpecificTest, OrderMultiFieldFuzzyMatch) {
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
auto results = coll1->search("charger", {"title", "description"}, "", {}, {}, {2}, 10,
1, FREQUENCY, {true, true}).get();
1, FREQUENCY, {true, true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
"<mark>", "</mark>", {1, 1}).get();
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
results = coll1->search("charger", {"title", "description"}, "", {}, {}, {2}, 10,
1, FREQUENCY, {true, true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
"<mark>", "</mark>", {2, 1}).get();
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
// use extreme weights to push title matching ahead
results = coll1->search("charger", {"title", "description"}, "", {}, {}, {2}, 10,
1, FREQUENCY, {true, true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
"<mark>", "</mark>", {10, 1}).get();
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("1", results["hits"][1]["document"]["id"].get<std::string>());
collectionManager.drop_collection("coll1");
}
@ -242,6 +263,98 @@ TEST_F(CollectionSpecificTest, FieldWeighting) {
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, MultiFieldArrayRepeatingTokens) {
std::vector<field> fields = {field("title", field_types::STRING, false),
field("description", field_types::STRING, false),
field("attrs", field_types::STRING_ARRAY, false),
field("points", field_types::INT32, false),};
Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["title"] = "E182-72/4";
doc1["description"] = "Nexsan Technologies 18 SAN Array - 18 x HDD Supported - 18 x HDD Installed";
doc1["attrs"] = {"Hard Drives Supported > 18", "Hard Drives Installed > 18", "SSD Supported > 18"};
doc1["points"] = 100;
nlohmann::json doc2;
doc2["id"] = "1";
doc2["title"] = "RV345-K9-NA";
doc2["description"] = "Cisco RV345P Router - 18 Ports";
doc2["attrs"] = {"Number of Ports > 18", "Product Type > Router"};
doc2["points"] = 50;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
auto results = coll1->search("rv345 cisco 18", {"title", "description", "attrs"}, "", {}, {}, {1}, 10,
1, FREQUENCY, {true, true, true}).get();
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, ExactMatchOnPrefix) {
std::vector<field> fields = {field("title", field_types::STRING, false),
field("points", field_types::INT32, false),};
Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["title"] = "Yeshivah Gedolah High School";
doc1["points"] = 100;
nlohmann::json doc2;
doc2["id"] = "1";
doc2["title"] = "GED";
doc2["points"] = 50;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
auto results = coll1->search("ged", {"title"}, "", {}, {}, {2}, 10,
1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 1).get();
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, TypoPrefixSearchWithoutPrefixEnabled) {
std::vector<field> fields = {field("title", field_types::STRING, false),
field("points", field_types::INT32, false),};
Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["title"] = "Cisco SG25026HP Gigabit Smart Switch";
doc1["points"] = 100;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
auto results = coll1->search("SG25026H", {"title"}, "", {}, {}, {2}, 10,
1, FREQUENCY, {false}, 0,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 1).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, PrefixWithTypos) {
std::vector<field> fields = {field("title", field_types::STRING, false),
field("points", field_types::INT32, false),};
@ -433,3 +546,70 @@ TEST_F(CollectionSpecificTest, DeleteOverridesAndSynonymsOnDiskDuringCollDrop) {
store->scan_fill(Collection::COLLECTION_SYNONYM_PREFIX, stored_values);
ASSERT_TRUE(stored_values.empty());
}
TEST_F(CollectionSpecificTest, SingleCharMatchFullFieldHighlight) {
std::vector<field> fields = {field("title", field_types::STRING, false),
field("points", field_types::INT32, false),};
Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["title"] = "Which of the following is a probable sign of infection?";
doc1["points"] = 100;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
auto results = coll1->search("a 3-month", {"title"}, "", {}, {}, {2}, 10,
1, FREQUENCY, {false}, 1,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"title", 1).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("Which of the following is <mark>a</mark> probable sign of infection?",
results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("Which of the following is <mark>a</mark> probable sign of infection?",
results["hits"][0]["highlights"][0]["value"].get<std::string>());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, TokensSpreadAcrossFields) {
std::vector<field> fields = {field("title", field_types::STRING, false),
field("description", field_types::STRING, false),
field("points", field_types::INT32, false),};
Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["title"] = "Foo bar baz";
doc1["description"] = "Share information with this device.";
doc1["points"] = 100;
nlohmann::json doc2;
doc2["id"] = "1";
doc2["title"] = "Foo Random";
doc2["description"] = "The Bar Fox";
doc2["points"] = 250;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
auto results = coll1->search("foo bar", {"title", "description"}, "", {}, {}, {0}, 10,
1, FREQUENCY, {false, false},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
"<mark>", "</mark>", {4, 1}).get();
LOG(INFO) << results;
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("1", results["hits"][1]["document"]["id"].get<std::string>());
collectionManager.drop_collection("coll1");
}

test/collection_test.cpp

@ -292,7 +292,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
}
results.clear();
results = collection->search("the a DoesNotExist", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get();
results = collection->search("the a insurance", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get();
ASSERT_EQ(0, results["hits"].size());
// with no indexed word
@ -859,7 +859,10 @@ TEST_F(CollectionTest, MultipleFields) {
// when "starring" takes higher priority than "title"
query_fields = {"starring", "title"};
results = coll_mul_fields->search("thomas", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get();
results = coll_mul_fields->search("thomas", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
"<mark>", "</mark>", {2, 1}).get();
ASSERT_EQ(4, results["hits"].size());
ids = {"15", "12", "13", "14"};
@ -2969,6 +2972,17 @@ TEST_F(CollectionTest, MultiFieldRelevance2) {
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
"<mark>", "</mark>", {1, 4}).get();
ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());
// use same weights
results = coll1->search("on a jetplane",
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
{true}, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
"<mark>", "</mark>", {1, 1}).get();
ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());