mirror of
https://github.com/typesense/typesense.git
synced 2025-05-17 20:22:32 +08:00
Allow non-string fields to be facetable.
This commit is contained in:
parent
49fddb0ce6
commit
95c8fb7082
@ -65,6 +65,16 @@ struct field {
|
||||
// Reports whether this field is flagged as a facet: returns the `facet` member as-is.
bool is_facet() const {
|
||||
return facet;
|
||||
}
|
||||
|
||||
bool is_array() const {
|
||||
return (type == field_types::STRING_ARRAY || type == field_types::INT32_ARRAY ||
|
||||
type == field_types::FLOAT_ARRAY ||
|
||||
type == field_types::INT64_ARRAY || type == field_types::BOOL_ARRAY);
|
||||
}
|
||||
|
||||
std::string faceted_name() const {
|
||||
return (facet && !is_string()) ? "_fstr_" + name : name;
|
||||
}
|
||||
};
|
||||
|
||||
struct filter {
|
||||
|
@ -252,15 +252,6 @@ public:
|
||||
// in the query that have the least individual hits one by one until enough results are found.
|
||||
static const int DROP_TOKENS_THRESHOLD = 10;
|
||||
|
||||
// strings under this length will be fully highlighted, instead of showing a snippet of relevant portion
|
||||
enum {SNIPPET_STR_ABOVE_LEN = 30};
|
||||
|
||||
// Using a $ prefix so that these meta keys stay above record entries in a lexicographically ordered KV store
|
||||
static constexpr const char* COLLECTION_META_PREFIX = "$CM";
|
||||
static constexpr const char* COLLECTION_NEXT_SEQ_PREFIX = "$CS";
|
||||
static constexpr const char* SEQ_ID_PREFIX = "$SI";
|
||||
static constexpr const char* DOC_ID_PREFIX = "$DI";
|
||||
|
||||
/*
|
||||
* Concurrency Primitives
|
||||
*/
|
||||
|
@ -749,6 +749,26 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
|
||||
value = document[a_facet.field_name];
|
||||
} else if(facet_schema.at(a_facet.field_name).type == field_types::STRING_ARRAY) {
|
||||
value = document[a_facet.field_name][facet_count.array_pos];
|
||||
} else if(facet_schema.at(a_facet.field_name).type == field_types::INT32) {
|
||||
value = std::to_string(document[a_facet.field_name].get<int32_t>());
|
||||
} else if(facet_schema.at(a_facet.field_name).type == field_types::INT32_ARRAY) {
|
||||
value = std::to_string(document[a_facet.field_name][facet_count.array_pos].get<int32_t>());
|
||||
} else if(facet_schema.at(a_facet.field_name).type == field_types::INT64) {
|
||||
value = std::to_string(document[a_facet.field_name].get<int64_t>());
|
||||
} else if(facet_schema.at(a_facet.field_name).type == field_types::INT64_ARRAY) {
|
||||
value = std::to_string(document[a_facet.field_name][facet_count.array_pos].get<int64_t>());
|
||||
} else if(facet_schema.at(a_facet.field_name).type == field_types::FLOAT) {
|
||||
value = std::to_string(document[a_facet.field_name].get<float>());
|
||||
value.erase ( value.find_last_not_of('0') + 1, std::string::npos ); // remove trailing zeros
|
||||
} else if(facet_schema.at(a_facet.field_name).type == field_types::FLOAT_ARRAY) {
|
||||
value = std::to_string(document[a_facet.field_name][facet_count.array_pos].get<float>());
|
||||
value.erase ( value.find_last_not_of('0') + 1, std::string::npos ); // remove trailing zeros
|
||||
} else if(facet_schema.at(a_facet.field_name).type == field_types::BOOL) {
|
||||
value = std::to_string(document[a_facet.field_name].get<bool>());
|
||||
value = (value == "1") ? "true" : "false";
|
||||
} else if(facet_schema.at(a_facet.field_name).type == field_types::BOOL_ARRAY) {
|
||||
value = std::to_string(document[a_facet.field_name][facet_count.array_pos].get<bool>());
|
||||
value = (value == "1") ? "true" : "false";
|
||||
}
|
||||
|
||||
std::vector<std::string> tokens;
|
||||
|
@ -15,10 +15,16 @@ Index::Index(const std::string name, const std::unordered_map<std::string, field
|
||||
name(name), search_schema(search_schema), facet_schema(facet_schema), sort_schema(sort_schema) {
|
||||
|
||||
for(const auto & pair: search_schema) {
|
||||
// NOTE: facet fields are also part of search schema
|
||||
art_tree *t = new art_tree;
|
||||
art_tree_init(t);
|
||||
search_index.emplace(pair.first, t);
|
||||
|
||||
// initialize for non-string facet fields
|
||||
if(pair.second.facet && !pair.second.is_string()) {
|
||||
art_tree *ft = new art_tree;
|
||||
art_tree_init(ft);
|
||||
search_index.emplace(pair.second.faceted_name(), ft);
|
||||
}
|
||||
}
|
||||
|
||||
for(const auto & pair: sort_schema) {
|
||||
@ -86,13 +92,56 @@ Option<uint32_t> Index::index_in_memory(const nlohmann::json &document, uint32_t
|
||||
// assumes that validation has already been done
|
||||
for(const std::pair<std::string, field> & field_pair: search_schema) {
|
||||
const std::string & field_name = field_pair.first;
|
||||
art_tree *t = search_index.at(field_name);
|
||||
|
||||
int facet_id = -1;
|
||||
if(facet_schema.count(field_name) != 0) {
|
||||
facet_id = facet_to_id[field_name];
|
||||
}
|
||||
|
||||
// non-string faceted field should be indexed as faceted string field as well
|
||||
if(field_pair.second.facet && !field_pair.second.is_string()) {
|
||||
art_tree *t = search_index.at(field_pair.second.faceted_name());
|
||||
|
||||
if(field_pair.second.is_array()) {
|
||||
std::vector<std::string> strings;
|
||||
|
||||
if(field_pair.second.type == field_types::INT32_ARRAY) {
|
||||
for(int32_t value: document[field_name]){
|
||||
strings.push_back(std::to_string(value));
|
||||
}
|
||||
} else if(field_pair.second.type == field_types::INT64_ARRAY) {
|
||||
for(int64_t value: document[field_name]){
|
||||
strings.push_back(std::to_string(value));
|
||||
}
|
||||
} else if(field_pair.second.type == field_types::FLOAT_ARRAY) {
|
||||
for(float value: document[field_name]){
|
||||
strings.push_back(std::to_string(value));
|
||||
}
|
||||
} else if(field_pair.second.type == field_types::BOOL_ARRAY) {
|
||||
for(bool value: document[field_name]){
|
||||
strings.push_back(std::to_string(value));
|
||||
}
|
||||
}
|
||||
index_string_array_field(strings, points, t, seq_id, facet_id);
|
||||
} else {
|
||||
std::string text;
|
||||
|
||||
if(field_pair.second.type == field_types::INT32) {
|
||||
text = std::to_string(document[field_name].get<int32_t>());
|
||||
} else if(field_pair.second.type == field_types::INT64) {
|
||||
text = std::to_string(document[field_name].get<int64_t>());
|
||||
} else if(field_pair.second.type == field_types::FLOAT) {
|
||||
text = std::to_string(document[field_name].get<float>());
|
||||
} else if(field_pair.second.type == field_types::BOOL) {
|
||||
text = std::to_string(document[field_name].get<bool>());
|
||||
}
|
||||
|
||||
index_string_field(text, points, t, seq_id, facet_id);
|
||||
}
|
||||
}
|
||||
|
||||
art_tree *t = search_index.at(field_name);
|
||||
|
||||
if(field_pair.second.type == field_types::STRING) {
|
||||
const std::string & text = document[field_name];
|
||||
index_string_field(text, points, t, seq_id, facet_id);
|
||||
@ -241,6 +290,7 @@ Option<uint32_t> Index::validate_index_in_memory(const nlohmann::json &document,
|
||||
}
|
||||
}
|
||||
|
||||
// since every facet field has to be a search field, we don't have to revalidate types here
|
||||
for(const std::pair<std::string, field> & field_pair: facet_schema) {
|
||||
const std::string & field_name = field_pair.first;
|
||||
|
||||
@ -248,22 +298,6 @@ Option<uint32_t> Index::validate_index_in_memory(const nlohmann::json &document,
|
||||
return Option<>(400, "Field `" + field_name + "` has been declared as a facet field in the schema, "
|
||||
"but is not found in the document.");
|
||||
}
|
||||
|
||||
if(field_pair.second.type == field_types::STRING) {
|
||||
if(!document[field_name].is_string()) {
|
||||
return Option<>(400, "Facet field `" + field_name + "` must be a string.");
|
||||
}
|
||||
} else if(field_pair.second.type == field_types::STRING_ARRAY) {
|
||||
if(!document[field_name].is_array()) {
|
||||
return Option<>(400, "Facet field `" + field_name + "` must be a string array.");
|
||||
}
|
||||
|
||||
if(document[field_name].size() > 0 && !document[field_name][0].is_string()) {
|
||||
return Option<>(400, "Facet field `" + field_name + "` must be a string array.");
|
||||
}
|
||||
} else {
|
||||
return Option<>(400, "Facet field `" + field_name + "` must be a string or a string[].");
|
||||
}
|
||||
}
|
||||
|
||||
return Option<>(200);
|
||||
@ -384,7 +418,6 @@ void Index::index_bool_field(const bool value, const uint32_t score, art_tree *t
|
||||
const int KEY_LEN = 1;
|
||||
unsigned char key[KEY_LEN];
|
||||
key[0] = value ? '1' : '0';
|
||||
//key[1] = '\0';
|
||||
|
||||
uint32_t num_hits = 0;
|
||||
art_leaf* leaf = (art_leaf *) art_search(t, key, KEY_LEN);
|
||||
@ -548,9 +581,21 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
|
||||
|
||||
if(a_facet.field_name == facet_query.field_name && !facet_query.query.empty()) {
|
||||
use_facet_query = true;
|
||||
const field & facet_field = facet_schema.at(a_facet.field_name);
|
||||
|
||||
if(facet_field.is_bool()) {
|
||||
if(facet_query.query == "true") {
|
||||
facet_query.query = "1";
|
||||
} else if(facet_query.query == "false") {
|
||||
facet_query.query = "0";
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> query_tokens;
|
||||
StringUtils::split(facet_query.query, query_tokens, " ");
|
||||
|
||||
art_tree *t = search_index.at(facet_field.faceted_name());
|
||||
|
||||
for(size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) {
|
||||
auto & q = query_tokens[qtoken_index];
|
||||
string_utils.unicode_normalize(q);
|
||||
@ -558,9 +603,11 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
|
||||
bool prefix_search = (qtoken_index == (query_tokens.size()-1)); // only last token must be used as prefix
|
||||
|
||||
std::vector<art_leaf*> leaves;
|
||||
art_fuzzy_search(search_index.at(a_facet.field_name), (const unsigned char *) q.c_str(),
|
||||
|
||||
art_fuzzy_search(t, (const unsigned char *) q.c_str(),
|
||||
q.size(), 0, bounded_cost, 10000,
|
||||
token_ordering::MAX_SCORE, prefix_search, leaves);
|
||||
|
||||
for(size_t i = 0; i < leaves.size(); i++) {
|
||||
const auto & leaf = leaves[i];
|
||||
// calculate hash without terminating null char
|
||||
|
@ -1773,9 +1773,10 @@ TEST_F(CollectionTest, FacetCounts) {
|
||||
std::ifstream infile(std::string(ROOT_DIR)+"test/numeric_array_documents.jsonl");
|
||||
std::vector<field> fields = {field("name", field_types::STRING, false),
|
||||
field("name_facet", field_types::STRING, true),
|
||||
field("age", field_types::INT32, false),
|
||||
field("years", field_types::INT32_ARRAY, false),
|
||||
field("timestamps", field_types::INT64_ARRAY, false),
|
||||
field("age", field_types::INT32, true),
|
||||
field("years", field_types::INT32_ARRAY, true),
|
||||
field("rating", field_types::FLOAT, true),
|
||||
field("timestamps", field_types::INT64_ARRAY, true),
|
||||
field("tags", field_types::STRING_ARRAY, true)};
|
||||
|
||||
std::vector<sort_by> sort_fields = { sort_by("age", "DESC") };
|
||||
@ -1938,8 +1939,53 @@ TEST_F(CollectionTest, FacetCounts) {
|
||||
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
|
||||
ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
|
||||
|
||||
// facet query on an integer field
|
||||
results = coll_array_fields->search("*", query_fields, "", {"age"}, sort_fields, 0, 10, 1, FREQUENCY,
|
||||
false, Index::DROP_TOKENS_THRESHOLD,
|
||||
spp::sparse_hash_set<std::string>(),
|
||||
spp::sparse_hash_set<std::string>(), 10, 500, "age: 2").get();
|
||||
|
||||
ASSERT_EQ(5, results["hits"].size());
|
||||
ASSERT_EQ(1, results["facet_counts"].size());
|
||||
ASSERT_STREQ("age", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
|
||||
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
|
||||
ASSERT_STREQ("24", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("<mark>2</mark>4", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
|
||||
|
||||
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]);
|
||||
ASSERT_STREQ("21", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("<mark>2</mark>1", results["facet_counts"][0]["counts"][1]["highlighted"].get<std::string>().c_str());
|
||||
|
||||
// facet query on a float field
|
||||
results = coll_array_fields->search("*", query_fields, "", {"rating"}, sort_fields, 0, 10, 1, FREQUENCY,
|
||||
false, Index::DROP_TOKENS_THRESHOLD,
|
||||
spp::sparse_hash_set<std::string>(),
|
||||
spp::sparse_hash_set<std::string>(), 10, 500, "rating: 7").get();
|
||||
|
||||
ASSERT_EQ(5, results["hits"].size());
|
||||
ASSERT_EQ(1, results["facet_counts"].size());
|
||||
ASSERT_STREQ("rating", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
|
||||
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
|
||||
ASSERT_STREQ("7.812", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("<mark>7</mark>.812", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
|
||||
|
||||
// facet query on an integer array field
|
||||
|
||||
results = coll_array_fields->search("*", query_fields, "", {"timestamps"}, sort_fields, 0, 10, 1, FREQUENCY,
|
||||
false, Index::DROP_TOKENS_THRESHOLD,
|
||||
spp::sparse_hash_set<std::string>(),
|
||||
spp::sparse_hash_set<std::string>(), 10, 500, "timestamps: 142189002").get();
|
||||
|
||||
ASSERT_EQ(5, results["hits"].size());
|
||||
ASSERT_EQ(1, results["facet_counts"].size());
|
||||
ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
|
||||
ASSERT_STREQ("timestamps", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
|
||||
ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]);
|
||||
ASSERT_STREQ("1421890022", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("<mark>142189002</mark>2", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
|
||||
|
||||
// facet query that does not match any indexed value
|
||||
results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
|
||||
results = coll_array_fields->search("*", query_fields, "", {facets}, sort_fields, 0, 10, 1, FREQUENCY,
|
||||
false, Index::DROP_TOKENS_THRESHOLD,
|
||||
spp::sparse_hash_set<std::string>(),
|
||||
spp::sparse_hash_set<std::string>(), 10, 500, " tags : notfound").get();
|
||||
@ -2101,6 +2147,60 @@ TEST_F(CollectionTest, FacetCountsHighlighting) {
|
||||
collectionManager.drop_collection("coll1");
|
||||
}
|
||||
|
||||
// Faceting on a boolean field: three documents are added (two with in_stock = true, one with
// in_stock = false), then a search filtered by and facet-queried on "in_stock:true" must return
// exactly one facet bucket, "true", with a count of 2 and the whole value highlighted.
TEST_F(CollectionTest, FacetCountsBool) {
    std::vector<field> schema_fields = {field("title", field_types::STRING, false),
                                        field("points", field_types::INT32, false),
                                        field("in_stock", field_types::BOOL, true)};

    std::vector<sort_by> sort_fields = {sort_by("points", "DESC")};

    // reuse the collection when it already exists, otherwise create it
    Collection *coll = collectionManager.get_collection("coll1");
    if(coll == nullptr) {
        coll = collectionManager.create_collection("coll1", schema_fields, "points").get();
    }

    // builds one document and adds it to the collection
    const auto add_car = [coll](const std::string & id, const std::string & title,
                                int points, bool in_stock) {
        nlohmann::json car;
        car["id"] = id;
        car["title"] = title;
        car["points"] = points;
        car["in_stock"] = in_stock;
        coll->add(car.dump());
    };

    add_car("100", "Ford Mustang", 25, true);
    add_car("101", "Tesla Model S", 40, false);
    add_car("102", "Chevrolet Beat", 10, true);

    std::vector<std::string> facets = {"in_stock"};

    nlohmann::json results = coll->search("*", {"title"}, "in_stock:true", facets, sort_fields, 0, 10, 1,
                                          token_ordering::FREQUENCY, true, 10,
                                          spp::sparse_hash_set<std::string>(),
                                          spp::sparse_hash_set<std::string>(), 10, 500,
                                          "in_stock:true").get();

    // sizes are asserted before indexing so a missing entry fails here instead of being created
    ASSERT_EQ(1, results["facet_counts"].size());
    ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());

    nlohmann::json & facet_result = results["facet_counts"][0];
    nlohmann::json & top_bucket = facet_result["counts"][0];

    ASSERT_STREQ("in_stock", facet_result["field_name"].get<std::string>().c_str());
    ASSERT_EQ(2, top_bucket["count"].get<int>());
    ASSERT_STREQ("true", top_bucket["value"].get<std::string>().c_str());
    ASSERT_STREQ("<mark>true</mark>", top_bucket["highlighted"].get<std::string>().c_str());

    collectionManager.drop_collection("coll1");
}
|
||||
|
||||
TEST_F(CollectionTest, SortingOrder) {
|
||||
Collection *coll_mul_fields;
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user