Allow non-string fields to be facetable.

This commit is contained in:
kishorenc 2020-02-26 22:50:19 +05:30
parent 49fddb0ce6
commit 95c8fb7082
5 changed files with 201 additions and 33 deletions

View File

@ -65,6 +65,16 @@ struct field {
// Returns the value of this field's `facet` flag.
bool is_facet() const {
    const bool facet_flag = facet;
    return facet_flag;
}
// Returns true when this field's type is one of the array types:
// string[], int32[], float[], int64[] or bool[].
bool is_array() const {
    if(type == field_types::STRING_ARRAY) {
        return true;
    }

    if(type == field_types::INT32_ARRAY) {
        return true;
    }

    if(type == field_types::FLOAT_ARRAY) {
        return true;
    }

    if(type == field_types::INT64_ARRAY) {
        return true;
    }

    return type == field_types::BOOL_ARRAY;
}
// Returns the name used for the string form of this field's facet values.
// A facet field that is not a string gets the `_fstr_` prefix prepended to its name;
// every other field (non-facet, or string facet) keeps its plain name.
std::string faceted_name() const {
    const bool is_non_string_facet = facet && !is_string();

    if(is_non_string_facet) {
        return "_fstr_" + name;
    }

    return name;
}
};
struct filter {

View File

@ -252,15 +252,6 @@ public:
// in the query that have the least individual hits one by one until enough results are found.
static const int DROP_TOKENS_THRESHOLD = 10;
// strings under this length will be fully highlighted, instead of showing a snippet of relevant portion
enum {SNIPPET_STR_ABOVE_LEN = 30};
// Using a $ prefix so that these meta keys stay above record entries in a lexicographically ordered KV store
static constexpr const char* COLLECTION_META_PREFIX = "$CM";
static constexpr const char* COLLECTION_NEXT_SEQ_PREFIX = "$CS";
static constexpr const char* SEQ_ID_PREFIX = "$SI";
static constexpr const char* DOC_ID_PREFIX = "$DI";
/*
* Concurrency Primitives
*/

View File

@ -749,6 +749,26 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
value = document[a_facet.field_name];
} else if(facet_schema.at(a_facet.field_name).type == field_types::STRING_ARRAY) {
value = document[a_facet.field_name][facet_count.array_pos];
} else if(facet_schema.at(a_facet.field_name).type == field_types::INT32) {
value = std::to_string(document[a_facet.field_name].get<int32_t>());
} else if(facet_schema.at(a_facet.field_name).type == field_types::INT32_ARRAY) {
value = std::to_string(document[a_facet.field_name][facet_count.array_pos].get<int32_t>());
} else if(facet_schema.at(a_facet.field_name).type == field_types::INT64) {
value = std::to_string(document[a_facet.field_name].get<int64_t>());
} else if(facet_schema.at(a_facet.field_name).type == field_types::INT64_ARRAY) {
value = std::to_string(document[a_facet.field_name][facet_count.array_pos].get<int64_t>());
} else if(facet_schema.at(a_facet.field_name).type == field_types::FLOAT) {
value = std::to_string(document[a_facet.field_name].get<float>());
value.erase ( value.find_last_not_of('0') + 1, std::string::npos ); // remove trailing zeros
} else if(facet_schema.at(a_facet.field_name).type == field_types::FLOAT_ARRAY) {
value = std::to_string(document[a_facet.field_name][facet_count.array_pos].get<float>());
value.erase ( value.find_last_not_of('0') + 1, std::string::npos ); // remove trailing zeros
} else if(facet_schema.at(a_facet.field_name).type == field_types::BOOL) {
value = std::to_string(document[a_facet.field_name].get<bool>());
value = (value == "1") ? "true" : "false";
} else if(facet_schema.at(a_facet.field_name).type == field_types::BOOL_ARRAY) {
value = std::to_string(document[a_facet.field_name][facet_count.array_pos].get<bool>());
value = (value == "1") ? "true" : "false";
}
std::vector<std::string> tokens;

View File

@ -15,10 +15,16 @@ Index::Index(const std::string name, const std::unordered_map<std::string, field
name(name), search_schema(search_schema), facet_schema(facet_schema), sort_schema(sort_schema) {
for(const auto & pair: search_schema) {
// NOTE: facet fields are also part of search schema
art_tree *t = new art_tree;
art_tree_init(t);
search_index.emplace(pair.first, t);
// initialize for non-string facet fields
if(pair.second.facet && !pair.second.is_string()) {
art_tree *ft = new art_tree;
art_tree_init(ft);
search_index.emplace(pair.second.faceted_name(), ft);
}
}
for(const auto & pair: sort_schema) {
@ -86,13 +92,56 @@ Option<uint32_t> Index::index_in_memory(const nlohmann::json &document, uint32_t
// assumes that validation has already been done
for(const std::pair<std::string, field> & field_pair: search_schema) {
const std::string & field_name = field_pair.first;
art_tree *t = search_index.at(field_name);
int facet_id = -1;
if(facet_schema.count(field_name) != 0) {
facet_id = facet_to_id[field_name];
}
// non-string faceted field should be indexed as faceted string field as well
if(field_pair.second.facet && !field_pair.second.is_string()) {
art_tree *t = search_index.at(field_pair.second.faceted_name());
if(field_pair.second.is_array()) {
std::vector<std::string> strings;
if(field_pair.second.type == field_types::INT32_ARRAY) {
for(int32_t value: document[field_name]){
strings.push_back(std::to_string(value));
}
} else if(field_pair.second.type == field_types::INT64_ARRAY) {
for(int64_t value: document[field_name]){
strings.push_back(std::to_string(value));
}
} else if(field_pair.second.type == field_types::FLOAT_ARRAY) {
for(float value: document[field_name]){
strings.push_back(std::to_string(value));
}
} else if(field_pair.second.type == field_types::BOOL_ARRAY) {
for(bool value: document[field_name]){
strings.push_back(std::to_string(value));
}
}
index_string_array_field(strings, points, t, seq_id, facet_id);
} else {
std::string text;
if(field_pair.second.type == field_types::INT32) {
text = std::to_string(document[field_name].get<int32_t>());
} else if(field_pair.second.type == field_types::INT64) {
text = std::to_string(document[field_name].get<int64_t>());
} else if(field_pair.second.type == field_types::FLOAT) {
text = std::to_string(document[field_name].get<float>());
} else if(field_pair.second.type == field_types::BOOL) {
text = std::to_string(document[field_name].get<bool>());
}
index_string_field(text, points, t, seq_id, facet_id);
}
}
art_tree *t = search_index.at(field_name);
if(field_pair.second.type == field_types::STRING) {
const std::string & text = document[field_name];
index_string_field(text, points, t, seq_id, facet_id);
@ -241,6 +290,7 @@ Option<uint32_t> Index::validate_index_in_memory(const nlohmann::json &document,
}
}
// since every facet field has to be a search field, we don't have to revalidate types here
for(const std::pair<std::string, field> & field_pair: facet_schema) {
const std::string & field_name = field_pair.first;
@ -248,22 +298,6 @@ Option<uint32_t> Index::validate_index_in_memory(const nlohmann::json &document,
return Option<>(400, "Field `" + field_name + "` has been declared as a facet field in the schema, "
"but is not found in the document.");
}
if(field_pair.second.type == field_types::STRING) {
if(!document[field_name].is_string()) {
return Option<>(400, "Facet field `" + field_name + "` must be a string.");
}
} else if(field_pair.second.type == field_types::STRING_ARRAY) {
if(!document[field_name].is_array()) {
return Option<>(400, "Facet field `" + field_name + "` must be a string array.");
}
if(document[field_name].size() > 0 && !document[field_name][0].is_string()) {
return Option<>(400, "Facet field `" + field_name + "` must be a string array.");
}
} else {
return Option<>(400, "Facet field `" + field_name + "` must be a string or a string[].");
}
}
return Option<>(200);
@ -384,7 +418,6 @@ void Index::index_bool_field(const bool value, const uint32_t score, art_tree *t
const int KEY_LEN = 1;
unsigned char key[KEY_LEN];
key[0] = value ? '1' : '0';
//key[1] = '\0';
uint32_t num_hits = 0;
art_leaf* leaf = (art_leaf *) art_search(t, key, KEY_LEN);
@ -548,9 +581,21 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
if(a_facet.field_name == facet_query.field_name && !facet_query.query.empty()) {
use_facet_query = true;
const field & facet_field = facet_schema.at(a_facet.field_name);
if(facet_field.is_bool()) {
if(facet_query.query == "true") {
facet_query.query = "1";
} else if(facet_query.query == "false") {
facet_query.query = "0";
}
}
std::vector<std::string> query_tokens;
StringUtils::split(facet_query.query, query_tokens, " ");
art_tree *t = search_index.at(facet_field.faceted_name());
for(size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) {
auto & q = query_tokens[qtoken_index];
string_utils.unicode_normalize(q);
@ -558,9 +603,11 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
bool prefix_search = (qtoken_index == (query_tokens.size()-1)); // only last token must be used as prefix
std::vector<art_leaf*> leaves;
art_fuzzy_search(search_index.at(a_facet.field_name), (const unsigned char *) q.c_str(),
art_fuzzy_search(t, (const unsigned char *) q.c_str(),
q.size(), 0, bounded_cost, 10000,
token_ordering::MAX_SCORE, prefix_search, leaves);
for(size_t i = 0; i < leaves.size(); i++) {
const auto & leaf = leaves[i];
// calculate hash without terminating null char

View File

@ -1773,9 +1773,10 @@ TEST_F(CollectionTest, FacetCounts) {
std::ifstream infile(std::string(ROOT_DIR)+"test/numeric_array_documents.jsonl");
std::vector<field> fields = {field("name", field_types::STRING, false),
field("name_facet", field_types::STRING, true),
field("age", field_types::INT32, false),
field("years", field_types::INT32_ARRAY, false),
field("timestamps", field_types::INT64_ARRAY, false),
field("age", field_types::INT32, true),
field("years", field_types::INT32_ARRAY, true),
field("rating", field_types::FLOAT, true),
field("timestamps", field_types::INT64_ARRAY, true),
field("tags", field_types::STRING_ARRAY, true)};
std::vector<sort_by> sort_fields = { sort_by("age", "DESC") };
@ -1938,8 +1939,53 @@ TEST_F(CollectionTest, FacetCounts) {
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
// facet query on an integer field
results = coll_array_fields->search("*", query_fields, "", {"age"}, sort_fields, 0, 10, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, 500, "age: 2").get();
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_STREQ("age", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("24", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>2</mark>4", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]);
ASSERT_STREQ("21", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>2</mark>1", results["facet_counts"][0]["counts"][1]["highlighted"].get<std::string>().c_str());
// facet query on a float field
results = coll_array_fields->search("*", query_fields, "", {"rating"}, sort_fields, 0, 10, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, 500, "rating: 7").get();
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_STREQ("rating", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("7.812", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>7</mark>.812", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
// facet query on an integer array field
results = coll_array_fields->search("*", query_fields, "", {"timestamps"}, sort_fields, 0, 10, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, 500, "timestamps: 142189002").get();
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
ASSERT_STREQ("timestamps", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("1421890022", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>142189002</mark>2", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
// facet query that does not match any indexed value
results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
results = coll_array_fields->search("*", query_fields, "", {facets}, sort_fields, 0, 10, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, 500, " tags : notfound").get();
@ -2101,6 +2147,60 @@ TEST_F(CollectionTest, FacetCountsHighlighting) {
collectionManager.drop_collection("coll1");
}
// Verifies faceting on a boolean field: three documents are indexed (two with
// `in_stock` = true, one with false), then a search that both filters and
// facet-queries on `in_stock:true` must report a single facet value "true"
// with a count of 2, fully highlighted.
TEST_F(CollectionTest, FacetCountsBool) {
    std::vector<field> schema_fields = {field("title", field_types::STRING, false),
                                        field("points", field_types::INT32, false),
                                        field("in_stock", field_types::BOOL, true)};

    std::vector<sort_by> sort_criteria = {sort_by("points", "DESC")};

    // reuse the collection when it already exists, otherwise create it
    Collection *bool_coll = collectionManager.get_collection("coll1");
    if(bool_coll == nullptr) {
        bool_coll = collectionManager.create_collection("coll1", schema_fields, "points").get();
    }

    // builds one document from its four fields and adds it to the collection
    auto add_record = [&bool_coll](const std::string & id, const std::string & title,
                                   int points, bool in_stock) {
        nlohmann::json record;
        record["id"] = id;
        record["title"] = title;
        record["points"] = points;
        record["in_stock"] = in_stock;
        bool_coll->add(record.dump());
    };

    add_record("100", "Ford Mustang", 25, true);
    add_record("101", "Tesla Model S", 40, false);
    add_record("102", "Chevrolet Beat", 10, true);

    std::vector<std::string> facet_fields = {"in_stock"};

    // "in_stock:true" is used both as the filter and as the facet query
    nlohmann::json results = bool_coll->search("*", {"title"}, "in_stock:true", facet_fields, sort_criteria,
                                               0, 10, 1, token_ordering::FREQUENCY, true, 10,
                                               spp::sparse_hash_set<std::string>(),
                                               spp::sparse_hash_set<std::string>(),
                                               10, 500, "in_stock:true").get();

    ASSERT_EQ(1, results["facet_counts"].size());

    nlohmann::json & facet_entry = results["facet_counts"][0];
    ASSERT_EQ(1, facet_entry["counts"].size());
    ASSERT_STREQ("in_stock", facet_entry["field_name"].get<std::string>().c_str());

    nlohmann::json & top_count = facet_entry["counts"][0];
    ASSERT_EQ(2, top_count["count"].get<int>());
    ASSERT_STREQ("true", top_count["value"].get<std::string>().c_str());
    ASSERT_STREQ("<mark>true</mark>", top_count["highlighted"].get<std::string>().c_str());

    collectionManager.drop_collection("coll1");
}
TEST_F(CollectionTest, SortingOrder) {
Collection *coll_mul_fields;