Mirror of https://github.com/typesense/typesense.git
Merge branch '0.22.0-rc' into postings-refactor-integration
# Conflicts:
#	include/index.h
#	include/posting.h
#	include/posting_list.h
#	src/art.cpp
#	src/collection.cpp
#	src/index.cpp
#	src/posting.cpp
#	src/posting_list.cpp
#	test/art_test.cpp
#	test/collection_specific_test.cpp
#	test/collection_test.cpp
#	test/posting_list_test.cpp
Commit 41c16fb7a7

README.md
@@ -5,6 +5,11 @@
Typesense is a fast, typo-tolerant search engine for building delightful search experiences.
</p>

<p align="center">
An Open Source Algolia Alternative & <br>
An Easier-to-Use ElasticSearch Alternative
</p>

<p align="center">
<a href="https://circleci.com/gh/typesense/typesense"><img src="https://circleci.com/gh/typesense/typesense.svg?style=shield&circle-token=1addd775339738a3d90869ddd8201110d561feaa"></a>
<a href="https://hub.docker.com/r/typesense/typesense/tags"><img src="https://img.shields.io/docker/pulls/typesense/typesense"></a>
@@ -25,7 +30,9 @@
- Search a 32M songs dataset from MusicBrainz: [songs-search.typesense.org](https://songs-search.typesense.org/)
- Search a 28M books dataset from OpenLibrary: [books-search.typesense.org](https://books-search.typesense.org/)
- Search a 2M recipe dataset from RecipeNLG: [recipe-search.typesense.org](https://recipe-search.typesense.org/)
- Search 1M Git commit messages from the Linux Kernel: [linux-commits-search.typesense.org](https://linux-commits-search.typesense.org/)
- Spellchecker with type-ahead, with 333K English words: [spellcheck.typesense.org](https://spellcheck.typesense.org/)
- An E-Commerce Store Browsing experience: [ecommerce-store.typesense.org](https://ecommerce-store.typesense.org/)

🗣️ 🎥 If you prefer watching videos, here's one where we introduce Typesense and show a walk-through: https://youtu.be/F4mB0x_B1AE?t=144

@@ -56,11 +63,13 @@
- **Faceting & Filtering:** Drill down and refine results.
- **Grouping & Distinct:** Group similar results together to show more variety.
- **Federated Search:** Search across multiple collections (indices) in a single HTTP request.
- **Geo Search:** Search and sort by results around a geographic location - [in beta](https://github.com/typesense/typesense/issues/78#issuecomment-842308057).
- **Scoped API Keys:** Generate API keys that only allow access to certain records, for multi-tenant applications.
- **Synonyms:** Define words as equivalents of each other, so searching for a word will also return results for the synonyms defined.
- **Curation & Merchandizing:** Boost particular records to a fixed position in the search results, to feature them.
- **Raft-based Clustering:** Setup a distributed cluster that is highly available.
- **Seamless Version Upgrades:** As new versions of Typesense come out, upgrading is as simple as swapping out the binary and restarting Typesense.
- **No Runtime Dependencies:** Typesense is a single binary that you can run locally or in production with a single command.

**Don't see a feature on this list?** Search our issue tracker if someone has already requested it and upvote it, or open a new issue if not. We prioritize our roadmap based on user feedback, so we'd love to hear from you.

@@ -102,7 +111,7 @@ Here's a quick example showcasing how you can create a collection, index a docum
Let's begin by starting the Typesense server via Docker:

```
docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.20.0 --data-dir /data --api-key=Hu52dwsas2AdxdE
docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.21.0 --data-dir /data --api-key=Hu52dwsas2AdxdE
```

We have [API Clients](#api-clients) in a couple of languages, but let's use the Python client for this example.
@@ -204,7 +213,7 @@ We welcome community contributions to add more official client libraries and int
You can use our [InstantSearch.js adapter](https://github.com/typesense/typesense-instantsearch-adapter)
to quickly build powerful search experiences, complete with filtering, sorting, pagination and more.

Here's how: [https://typesense.org/docs/0.20.0/guide/#search-ui](https://typesense.org/docs/0.20.0/guide/#search-ui)
Here's how: [https://typesense.org/docs/0.21.0/guide/#search-ui](https://typesense.org/docs/0.21.0/guide/#search-ui)

## FAQ

SECURITY.md (new file)
@@ -0,0 +1 @@
Please report any security vulnerabilities to security@typesense.org.
@@ -7,9 +7,7 @@ Please add to this file and send us a PR:
| Name | Description |
| ----------- | ----------- |
| [Recipe Search](https://recipe-search.typesense.org/) | A site that showcases Typesense in action on a 2 Million recipe database, with the ability to filter by ingredients.|
| [Books Search](https://books-search.typesense.org/) | A site that showcases Typesense in action on a 28 Million books database from [OpenLibrary](https://openlibrary.org/), with the ability to filter by authors and subject. |
| [MusicBrainz Songs Search](https://songs-search.typesense.org/) | A site that showcases Typesense in action on a 32 Million Songs database from [MusicBrainz](https://musicbrainz.org/) |
| [2020 US Presidential Candidates' Speeches Search](https://biden-trump-speeches-search.typesense.org/) | Instant Search speeches of US Presidential Candidates side-by-side. |
| [Linux Commit History Search](https://linux-commits-search.typesense.org/) | A site that indexes 1M linux commit messages in Typesense and lets you browse, search and filter through the commits.|
| [Grafikart](https://www.grafikart.fr/) | Learning resources library |
| [New York University Databrary](https://nyu.databrary.org/) | Video documentation library |
| [ElbiseBul](https://www.elbisebul.com/) | E-commerce |
@@ -20,3 +18,8 @@ Please add to this file and send us a PR:
| [Have A Class](https://haveaclass.com/) | Find the perfect teacher online |
| [Follow up Boss](https://www.followupboss.com/) | Real-estate CRM software, using Typesense for user notes search. |
| [Jobsort](https://www.jobsort.com/) | Job search engine for developers, by developers (quicksort for tech jobs.) |
| [Books Search](https://books-search.typesense.org/) | A site that showcases Typesense in action on a 28 Million books database from [OpenLibrary](https://openlibrary.org/), with the ability to filter by authors and subject. |
| [MusicBrainz Songs Search](https://songs-search.typesense.org/) | A site that showcases Typesense in action on a 32 Million Songs database from [MusicBrainz](https://musicbrainz.org/) |
| [2020 US Presidential Candidates' Speeches Search](https://biden-trump-speeches-search.typesense.org/) | Instant Search speeches of US Presidential Candidates side-by-side. |
| [E-commerce Store Browsing Experience](https://ecommerce-store.typesense.org/) | A site that showcases how to build an e-commerce storefront browsing experience with Typesense. |
| [Read This Twice](https://www.readthistwice.com/) | Book discovery platform uses typesense to power the book/people search |
@@ -36,6 +36,7 @@ struct token_candidates {

struct search_field_t {
    std::string name;
    size_t priority;
    size_t weight;
};

@@ -601,21 +601,22 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::

    // process weights for search fields
    std::vector<search_field_t> weighted_search_fields;
    size_t max_weight = 100;

    if(query_by_weights.empty()) {
        max_weight = search_fields.size();
        for(size_t i=1; i <= search_fields.size(); i++) {
            query_by_weights.push_back(i);
            query_by_weights.push_back((max_weight - i) + 1);
        }
    } else {
        auto max_weight = *std::max_element(query_by_weights.begin(), query_by_weights.end());
        for(size_t i=0; i < query_by_weights.size(); i++) {
            query_by_weights[i] = (max_weight - query_by_weights[i]) + 1;
        }
        max_weight = *std::max_element(query_by_weights.begin(), query_by_weights.end());
    }

    for(size_t i=0; i < search_fields.size(); i++) {
        const auto& search_field = search_fields[i];
        weighted_search_fields.push_back({search_field, query_by_weights[i]});
        const auto priority = (max_weight - query_by_weights[i]) + 1;
        const auto weight = query_by_weights[i] + 1;
        weighted_search_fields.push_back({search_field, priority, weight});
    }

    std::vector<facet> facets;
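For context, here is a minimal standalone sketch of the weighting step in the hunk above, following its newer lines (the ones that derive a `priority` and a `weight` per `query_by` field and store them in `search_field_t`). The field names and the `{2, 1}` weights are illustrative, and this is not the actual `Collection::search` code:

```
// Sketch (not the library API): how the hunk above derives a per-field
// `priority` and `weight` from query_by_weights. Values are illustrative.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

struct search_field_t {
    std::string name;
    size_t priority;  // rank-style value derived from the weight
    size_t weight;    // per-field multiplier derived from the weight
};

int main() {
    std::vector<std::string> search_fields = {"title", "description"};
    std::vector<size_t> query_by_weights = {2, 1};

    size_t max_weight = 100;
    if(query_by_weights.empty()) {
        // no weights supplied: derive descending values from the field order
        max_weight = search_fields.size();
        for(size_t i = 1; i <= search_fields.size(); i++) {
            query_by_weights.push_back((max_weight - i) + 1);
        }
    } else {
        // invert the user-supplied weights relative to their maximum
        // (renamed here to avoid shadowing max_weight)
        auto user_max = *std::max_element(query_by_weights.begin(), query_by_weights.end());
        for(auto& w : query_by_weights) {
            w = (user_max - w) + 1;
        }
        max_weight = *std::max_element(query_by_weights.begin(), query_by_weights.end());
    }

    std::vector<search_field_t> weighted_search_fields;
    for(size_t i = 0; i < search_fields.size(); i++) {
        const size_t priority = (max_weight - query_by_weights[i]) + 1;
        const size_t weight = query_by_weights[i] + 1;
        weighted_search_fields.push_back({search_fields[i], priority, weight});
    }

    for(const auto& f : weighted_search_fields) {
        std::cout << f.name << " priority=" << f.priority << " weight=" << f.weight << "\n";
    }
}
```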
@@ -1699,7 +1700,9 @@ void Collection::highlight_result(const field &search_field,
    if(offset_it != token_offsets.end()) {
        if (i == offset_it->first) {
            value_stream << highlight_start_tag;
        } else if (i == offset_it->second) {
        }

        if (i == offset_it->second) {
            value_stream << text[i];
            value_stream << highlight_end_tag;
            offset_it++;
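A minimal sketch of the highlighting behaviour this hunk moves toward: the character at a token's end offset is written before the closing tag, so a single-character token (whose start and end offsets coincide) still gets wrapped. The text, offsets and tags below are illustrative, not the real `highlight_result` inputs:

```
// Sketch: emitting highlight tags around token offsets, writing the character
// at the end offset before the closing tag so single-character tokens
// (start == end) are still wrapped. Offsets and data are made up.
#include <cstddef>
#include <iostream>
#include <map>
#include <sstream>
#include <string>

int main() {
    const std::string text = "is a probable sign";
    // token offsets: start index -> end index (inclusive); {3, 3} is the token "a"
    std::map<size_t, size_t> token_offsets = {{3, 3}, {5, 12}};
    const std::string highlight_start_tag = "<mark>";
    const std::string highlight_end_tag = "</mark>";

    std::stringstream value_stream;
    auto offset_it = token_offsets.begin();

    for(size_t i = 0; i < text.size(); i++) {
        if(offset_it != token_offsets.end()) {
            if(i == offset_it->first) {
                value_stream << highlight_start_tag;
            }

            if(i == offset_it->second) {
                value_stream << text[i];          // write the final character first
                value_stream << highlight_end_tag;
                offset_it++;
                continue;
            }
        }
        value_stream << text[i];
    }

    // prints: is <mark>a</mark> <mark>probable</mark> sign
    std::cout << value_stream.str() << "\n";
}
```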
@@ -908,6 +908,7 @@ void Index::search_candidates(const uint8_t & field_id,
        uint32_t total_cost = next_suggestion(token_candidates_vec, n, actual_query_suggestion,
                                              query_suggestion, token_bits);

        //LOG(INFO) << "field_num_results: " << field_num_results << ", typo_tokens_threshold: " << typo_tokens_threshold;
        /*LOG(INFO) << "n: " << n;
        for(size_t i=0; i < actual_query_suggestion.size(); i++) {
            LOG(INFO) << "i: " << i << " - " << actual_query_suggestion[i]->key << ", ids: "
@@ -1026,11 +1027,6 @@ void Index::search_candidates(const uint8_t & field_id,
        delete [] excluded_result_ids;

        searched_queries.push_back(actual_query_suggestion);

        //LOG(INFO) << "field_num_results: " << field_num_results << ", typo_tokens_threshold: " << typo_tokens_threshold;
        if(field_num_results >= typo_tokens_threshold) {
            break;
        }
    }
}

@@ -1667,15 +1663,21 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,

    uint64_t verbatim_match_fields = 0;      // query matching field verbatim
    uint64_t exact_match_fields = 0;         // number of fields that contains all of query tokens
    uint64_t max_weighted_tokens_match = 0;  // weighted max number of tokens matched in a field
    uint64_t total_token_matches = 0;        // total matches across fields (including fuzzy ones)

    //LOG(INFO) << "Init pop count: " << __builtin_popcount(token_bits);

    for(size_t i = 0; i < num_search_fields; i++) {
        const auto field_id = (uint8_t)(FIELD_LIMIT_NUM - i);
        size_t weight = search_fields[i].weight;
        const size_t priority = search_fields[i].priority;
        const size_t weight = search_fields[i].weight;

        //LOG(INFO) << "--- field index: " << i << ", weight: " << weight;
        //LOG(INFO) << "--- field index: " << i << ", priority: " << priority;
        // using `5` here because typo + prefix combo score range is: 0 - 5
        // 0    1    2
        // 0,1  2,3  4,5
        int64_t MAX_SUM_TYPOS = 5 * field_query_tokens[i].q_include_tokens.size();

        if(existing_field_kvs.count(field_id) != 0) {
            // for existing field, we will simply sum field-wise weighted scores
@@ -1686,14 +1688,22 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,

            uint64_t tokens_found = ((match_score >> 24) & 0xFF);
            uint64_t field_typos = 255 - ((match_score >> 16) & 0xFF);
            total_typos += (field_typos + 1) * weight;
            total_distances += ((100 - ((match_score >> 8) & 0xFF)) + 1) * weight;
            total_typos += (field_typos + 1) * priority;
            total_distances += ((100 - ((match_score >> 8) & 0xFF)) + 1) * priority;
            verbatim_match_fields += (((match_score & 0xFF)) + 1);

            if(field_typos == 0 && tokens_found == field_query_tokens[i].q_include_tokens.size()) {
            uint64_t unique_tokens_found =
                int64_t(__builtin_popcount(existing_field_kvs[field_id]->token_bits)) - 1;

            if(field_typos == 0 && unique_tokens_found == field_query_tokens[i].q_include_tokens.size()) {
                exact_match_fields++;
            }

            auto weighted_tokens_match = (tokens_found * weight) + (MAX_SUM_TYPOS - field_typos + 1);
            if(weighted_tokens_match > max_weighted_tokens_match) {
                max_weighted_tokens_match = weighted_tokens_match;
            }

            if(field_typos < min_typos) {
                min_typos = field_typos;
            }
@@ -1701,9 +1711,9 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,
            total_token_matches += tokens_found;

            /*LOG(INFO) << "seq_id: " << seq_id << ", total_typos: " << (255 - ((match_score >> 8) & 0xFF))
                      << ", weighted typos: " << std::max<uint64_t>((255 - ((match_score >> 8) & 0xFF)), 1) * weight
                      << ", weighted typos: " << std::max<uint64_t>((255 - ((match_score >> 8) & 0xFF)), 1) * priority
                      << ", total dist: " << (((match_score & 0xFF)))
                      << ", weighted dist: " << std::max<uint64_t>((100 - (match_score & 0xFF)), 1) * weight;*/
                      << ", weighted dist: " << std::max<uint64_t>((100 - (match_score & 0xFF)), 1) * priority;*/
            continue;
        }

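The shifts and masks above imply how the packed per-field `match_score` is read back, and how a field counts as an exact match (zero typos and a popcount of the per-field token bitmask covering every query token). A hedged decode sketch follows; the bit layout is inferred only from the expressions visible in the hunk, and the sample values and bitmask are made up:

```
// Sketch: unpacking the per-field match_score the way the hunk above reads it.
// The layout is inferred purely from the shifts/masks shown there; anything
// beyond those bytes is an assumption, and the sample values are invented.
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical packed score: 3 tokens found, 0 typos, best distance, verbatim byte 0.
    uint64_t match_score = (uint64_t(3)   << 24) |   // tokens found
                           (uint64_t(255) << 16) |   // 255 - typos     (so 0 typos)
                           (uint64_t(100) << 8)  |   // 100 - distance  (so distance term 0)
                           uint64_t(0);              // verbatim-match byte

    uint64_t tokens_found = (match_score >> 24) & 0xFF;
    uint64_t field_typos  = 255 - ((match_score >> 16) & 0xFF);
    uint64_t field_dist   = 100 - ((match_score >> 8) & 0xFF);   // distance term used in total_distances
    uint64_t verbatim     = match_score & 0xFF;

    // Unique query tokens matched in the field: popcount of a per-field token
    // bitmask, minus one in the hunk above. The bitmask here is illustrative.
    uint32_t token_bits = 0b1110;
    int unique_tokens_found = __builtin_popcount(token_bits) - 1;

    std::printf("tokens=%llu typos=%llu dist=%llu verbatim=%llu unique=%d\n",
                (unsigned long long)tokens_found, (unsigned long long)field_typos,
                (unsigned long long)field_dist, (unsigned long long)verbatim,
                unique_tokens_found);
}
```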
@@ -1746,14 +1756,20 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,

            uint64_t tokens_found = ((match_score >> 24) & 0xFF);
            uint64_t field_typos = 255 - ((match_score >> 16) & 0xFF);
            total_distances += ((100 - ((match_score >> 8) & 0xFF)) + 1) * weight;
            total_typos += (field_typos + 1) * weight;
            total_distances += ((100 - ((match_score >> 8) & 0xFF)) + 1) * priority;
            total_typos += (field_typos + 1) * priority;

            if(field_typos == 0 && tokens_found == field_query_tokens[i].q_include_tokens.size()) {
                exact_match_fields++;
                verbatim_match_fields++;  // this is only an approximate
            }

            auto weighted_tokens_match = (tokens_found * weight) + (MAX_SUM_TYPOS - field_typos + 1);

            if(weighted_tokens_match > max_weighted_tokens_match) {
                max_weighted_tokens_match = weighted_tokens_match;
            }

            if(field_typos < min_typos) {
                min_typos = field_typos;
            }
@@ -1768,9 +1784,11 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,

        total_typos = std::min<uint64_t>(255, total_typos);
        total_distances = std::min<uint64_t>(100, total_distances);
        max_weighted_tokens_match = std::min<uint64_t>(255, max_weighted_tokens_match);

        uint64_t aggregated_score = (
            (exact_match_fields << 48) |          // number of fields that contain *all tokens* in the query
            //(exact_match_fields << 48) |        // number of fields that contain *all tokens* in the query
            (max_weighted_tokens_match << 48) |   // weighted max number of tokens matched in a field
            (uniq_tokens_found << 40) |           // number of unique tokens found across fields including typos
            ((255 - min_typos) << 32) |           // minimum typo cost across all fields
            (total_token_matches << 24) |         // total matches across fields including typos
@@ -1783,6 +1801,7 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,

        /*LOG(INFO) << "seq id: " << seq_id
                  << ", exact_match_fields: " << exact_match_fields
                  << ", max_weighted_tokens_match: " << max_weighted_tokens_match
                  << ", uniq_tokens_found: " << uniq_tokens_found
                  << ", min typo score: " << (255 - min_typos)
                  << ", total_token_matches: " << total_token_matches
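A sketch of how the clamped components are packed into the per-document `aggregated_score`, using only the bit positions visible in the hunk above; the lower 24 bits are omitted here because the excerpt cuts off after `total_token_matches`, and the sample values are made up:

```
// Sketch: packing the per-document aggregated score from the components shown
// above. Only the bit positions visible in the hunk are reproduced; the lower
// 24 bits are left out because the diff excerpt cuts off after them.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    uint64_t total_typos = 3, total_distances = 7;
    uint64_t max_weighted_tokens_match = 12, uniq_tokens_found = 2;
    uint64_t min_typos = 1, total_token_matches = 4;

    // clamp, as in the hunk above
    total_typos = std::min<uint64_t>(255, total_typos);
    total_distances = std::min<uint64_t>(100, total_distances);
    max_weighted_tokens_match = std::min<uint64_t>(255, max_weighted_tokens_match);

    uint64_t aggregated_score =
        (max_weighted_tokens_match << 48) |  // weighted max tokens matched in a field
        (uniq_tokens_found << 40)         |  // unique tokens found across fields
        ((255 - min_typos) << 32)         |  // minimum typo cost across all fields
        (total_token_matches << 24);         // total matches across fields
        // ... remaining (lower) bits are not shown in this hunk

    std::printf("total_typos=%llu total_distances=%llu aggregated_score=%llu\n",
                (unsigned long long)total_typos, (unsigned long long)total_distances,
                (unsigned long long)aggregated_score);
}
```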
@@ -1930,6 +1949,12 @@ void Index::search_field(const uint8_t & field_id,
        // when no more costs are left for this token
        if(token_to_costs[token_index].empty()) {
            // we can try to drop the token and search with remaining tokens

            if(field_num_results >= drop_tokens_threshold) {
                // but if drop_tokens_threshold is breached, we are done
                return ;
            }

            token_to_costs.erase(token_to_costs.begin()+token_index);
            search_tokens.erase(search_tokens.begin()+token_index);
            query_tokens.erase(query_tokens.begin()+token_index);
@@ -1956,8 +1981,8 @@ void Index::search_field(const uint8_t & field_id,

    resume_typo_loop:

    if(field_num_results >= drop_tokens_threshold || field_num_results >= typo_tokens_threshold) {
        // if either threshold is breached, we are done
    if(field_num_results >= typo_tokens_threshold) {
        // if typo threshold is breached, we are done
        return ;
    }

@@ -1968,6 +1993,11 @@ void Index::search_field(const uint8_t & field_id,
    if(!query_tokens.empty() && num_tokens_dropped < query_tokens.size()) {
        // Drop tokens from right until (len/2 + 1), and then from left until (len/2 + 1)

        if(field_num_results >= drop_tokens_threshold) {
            // if drop_tokens_threshold is breached, we are done
            return ;
        }

        std::vector<token_t> truncated_tokens;
        num_tokens_dropped++;

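Taken together, the two hunks above appear to split the early-exit checks: the typo threshold is tested at `resume_typo_loop`, while the drop-tokens threshold is tested only just before another token is actually dropped. A simplified, illustrative control-flow sketch (not the full `search_field` implementation; names mirror the diff, the surrounding loop and token type are stand-ins):

```
// Sketch of the threshold split suggested by the hunks above (simplified,
// illustrative control flow; not the real search_field() implementation).
#include <cstddef>
#include <string>
#include <vector>

void search_field_sketch(std::vector<std::string>& query_tokens,
                         size_t& field_num_results,
                         size_t typo_tokens_threshold,
                         size_t drop_tokens_threshold,
                         size_t& num_tokens_dropped) {
    // ... candidate generation across typo costs would happen here ...

    // resume_typo_loop:
    if(field_num_results >= typo_tokens_threshold) {
        // if typo threshold is breached, we are done
        return;
    }

    if(!query_tokens.empty() && num_tokens_dropped < query_tokens.size()) {
        // drop tokens from right until (len/2 + 1), and then from left
        if(field_num_results >= drop_tokens_threshold) {
            // drop-tokens threshold is breached: stop instead of dropping more
            return;
        }

        std::vector<std::string> truncated_tokens;  // stands in for token_t
        num_tokens_dropped++;
        // ... continue the search with truncated_tokens ...
    }
}

int main() {
    std::vector<std::string> tokens = {"foo", "bar"};
    size_t found = 0, dropped = 0;
    search_field_sketch(tokens, found, /*typo_tokens_threshold=*/100,
                        /*drop_tokens_threshold=*/10, dropped);
}
```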
@@ -21,7 +21,7 @@ std::string get_query(StringUtils & string_utils, std::string & text) {

    for(uint32_t i=0; i<tokens.size(); i++) {
        auto token = tokens[i];
        string_utils.unicode_normalize(token);
        //string_utils.unicode_normalize(token);
        normalized_tokens.push_back(token);
    }

@@ -87,7 +87,7 @@ void benchmark_hn_titles(char* file_path) {
    auto begin = std::chrono::high_resolution_clock::now();

    for(size_t i = 0; i < queries.size(); i++) {
        auto results_op = collection->search(queries[i], search_fields, "", { }, {sort_by("points", "DESC")}, 2, 10, 1, MAX_SCORE, true);
        auto results_op = collection->search(queries[i], search_fields, "", { }, {sort_by("points", "DESC")}, {2}, 10, 1, MAX_SCORE, {true});
        if(results_op.ok() != true) {
            exit(2);
        }
@@ -152,8 +152,8 @@ void benchmark_reactjs_pages(char* file_path) {
    auto begin = std::chrono::high_resolution_clock::now();

    for(size_t i = 0; i < queries.size(); i++) {
        auto results_op = collection->search(queries[i], search_fields, "", { }, {sort_by("dummy_sorting_field", "DESC")}, 2, 10, 1,
                                             MAX_SCORE, true, 10, spp::sparse_hash_set<std::string>(), {"p"});
        auto results_op = collection->search(queries[i], search_fields, "", { }, {sort_by("dummy_sorting_field", "DESC")}, {2}, 10, 1,
                                             MAX_SCORE, {true}, 10, spp::sparse_hash_set<std::string>(), {"p"});
        if(results_op.ok() != true) {
            exit(2);
        }
@@ -169,8 +169,6 @@ TEST_F(CollectionSpecificTest, ExactSingleFieldMatch) {
    auto results = coll1->search("charger", {"title", "description"}, "", {}, {}, {2}, 10,
                                 1, FREQUENCY, {true, true}).get();

    LOG(INFO) << results;

    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("1", results["hits"][1]["document"]["id"].get<std::string>());

@@ -200,11 +198,34 @@ TEST_F(CollectionSpecificTest, OrderMultiFieldFuzzyMatch) {
    ASSERT_TRUE(coll1->add(doc2.dump()).ok());

    auto results = coll1->search("charger", {"title", "description"}, "", {}, {}, {2}, 10,
                                 1, FREQUENCY, {true, true}).get();
                                 1, FREQUENCY, {true, true},
                                 10, spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                                 "<mark>", "</mark>", {1, 1}).get();

    ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());

    results = coll1->search("charger", {"title", "description"}, "", {}, {}, {2}, 10,
                            1, FREQUENCY, {true, true},
                            10, spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                            "<mark>", "</mark>", {2, 1}).get();

    ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());

    // use extreme weights to push title matching ahead

    results = coll1->search("charger", {"title", "description"}, "", {}, {}, {2}, 10,
                            1, FREQUENCY, {true, true},
                            10, spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                            "<mark>", "</mark>", {10, 1}).get();

    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("1", results["hits"][1]["document"]["id"].get<std::string>());

    collectionManager.drop_collection("coll1");
}

@@ -242,6 +263,98 @@ TEST_F(CollectionSpecificTest, FieldWeighting) {
    collectionManager.drop_collection("coll1");
}

TEST_F(CollectionSpecificTest, MultiFieldArrayRepeatingTokens) {
    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("description", field_types::STRING, false),
                                 field("attrs", field_types::STRING_ARRAY, false),
                                 field("points", field_types::INT32, false),};

    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();

    nlohmann::json doc1;
    doc1["id"] = "0";
    doc1["title"] = "E182-72/4";
    doc1["description"] = "Nexsan Technologies 18 SAN Array - 18 x HDD Supported - 18 x HDD Installed";
    doc1["attrs"] = {"Hard Drives Supported > 18", "Hard Drives Installed > 18", "SSD Supported > 18"};
    doc1["points"] = 100;

    nlohmann::json doc2;
    doc2["id"] = "1";
    doc2["title"] = "RV345-K9-NA";
    doc2["description"] = "Cisco RV345P Router - 18 Ports";
    doc2["attrs"] = {"Number of Ports > 18", "Product Type > Router"};
    doc2["points"] = 50;

    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
    ASSERT_TRUE(coll1->add(doc2.dump()).ok());

    auto results = coll1->search("rv345 cisco 18", {"title", "description", "attrs"}, "", {}, {}, {1}, 10,
                                 1, FREQUENCY, {true, true, true}).get();

    ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());

    collectionManager.drop_collection("coll1");
}

TEST_F(CollectionSpecificTest, ExactMatchOnPrefix) {
    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();

    nlohmann::json doc1;
    doc1["id"] = "0";
    doc1["title"] = "Yeshivah Gedolah High School";
    doc1["points"] = 100;

    nlohmann::json doc2;
    doc2["id"] = "1";
    doc2["title"] = "GED";
    doc2["points"] = 50;

    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
    ASSERT_TRUE(coll1->add(doc2.dump()).ok());

    auto results = coll1->search("ged", {"title"}, "", {}, {}, {2}, 10,
                                 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                 "", 1).get();

    ASSERT_EQ(2, results["hits"].size());

    ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());

    collectionManager.drop_collection("coll1");
}

TEST_F(CollectionSpecificTest, TypoPrefixSearchWithoutPrefixEnabled) {
    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();

    nlohmann::json doc1;
    doc1["id"] = "0";
    doc1["title"] = "Cisco SG25026HP Gigabit Smart Switch";
    doc1["points"] = 100;

    ASSERT_TRUE(coll1->add(doc1.dump()).ok());

    auto results = coll1->search("SG25026H", {"title"}, "", {}, {}, {2}, 10,
                                 1, FREQUENCY, {false}, 0,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                 "", 1).get();

    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());

    collectionManager.drop_collection("coll1");
}

TEST_F(CollectionSpecificTest, PrefixWithTypos) {
    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};
@@ -433,3 +546,70 @@ TEST_F(CollectionSpecificTest, DeleteOverridesAndSynonymsOnDiskDuringCollDrop) {
    store->scan_fill(Collection::COLLECTION_SYNONYM_PREFIX, stored_values);
    ASSERT_TRUE(stored_values.empty());
}

TEST_F(CollectionSpecificTest, SingleCharMatchFullFieldHighlight) {
    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();

    nlohmann::json doc1;
    doc1["id"] = "0";
    doc1["title"] = "Which of the following is a probable sign of infection?";
    doc1["points"] = 100;

    ASSERT_TRUE(coll1->add(doc1.dump()).ok());

    auto results = coll1->search("a 3-month", {"title"}, "", {}, {}, {2}, 10,
                                 1, FREQUENCY, {false}, 1,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                 "title", 1).get();

    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());

    ASSERT_EQ("Which of the following is <mark>a</mark> probable sign of infection?",
              results["hits"][0]["highlights"][0]["snippet"].get<std::string>());

    ASSERT_EQ("Which of the following is <mark>a</mark> probable sign of infection?",
              results["hits"][0]["highlights"][0]["value"].get<std::string>());

    collectionManager.drop_collection("coll1");
}

TEST_F(CollectionSpecificTest, TokensSpreadAcrossFields) {
    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("description", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();

    nlohmann::json doc1;
    doc1["id"] = "0";
    doc1["title"] = "Foo bar baz";
    doc1["description"] = "Share information with this device.";
    doc1["points"] = 100;

    nlohmann::json doc2;
    doc2["id"] = "1";
    doc2["title"] = "Foo Random";
    doc2["description"] = "The Bar Fox";
    doc2["points"] = 250;

    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
    ASSERT_TRUE(coll1->add(doc2.dump()).ok());

    auto results = coll1->search("foo bar", {"title", "description"}, "", {}, {}, {0}, 10,
                                 1, FREQUENCY, {false, false},
                                 10, spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                                 "<mark>", "</mark>", {4, 1}).get();

    LOG(INFO) << results;

    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("1", results["hits"][1]["document"]["id"].get<std::string>());

    collectionManager.drop_collection("coll1");
}

@@ -292,7 +292,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
    }

    results.clear();
    results = collection->search("the a DoesNotExist", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    results = collection->search("the a insurance", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    ASSERT_EQ(0, results["hits"].size());

    // with no indexed word
@@ -859,7 +859,10 @@ TEST_F(CollectionTest, MultipleFields) {
    // when "starring" takes higher priority than "title"

    query_fields = {"starring", "title"};
    results = coll_mul_fields->search("thomas", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get();
    results = coll_mul_fields->search("thomas", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false},
                                      10, spp::sparse_hash_set<std::string>(),
                                      spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                                      "<mark>", "</mark>", {2, 1}).get();
    ASSERT_EQ(4, results["hits"].size());

    ids = {"15", "12", "13", "14"};
@@ -2969,6 +2972,17 @@ TEST_F(CollectionTest, MultiFieldRelevance2) {
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                                 "<mark>", "</mark>", {1, 4}).get();

    ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());

    // use same weights

    results = coll1->search("on a jetplane",
                            {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
                            {true}, 10, spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                            "<mark>", "</mark>", {1, 1}).get();

    ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());