diff --git a/include/field.h b/include/field.h index 389e9eb0..6a27ae80 100644 --- a/include/field.h +++ b/include/field.h @@ -65,6 +65,16 @@ struct field { bool is_facet() const { return facet; } + + bool is_array() const { + return (type == field_types::STRING_ARRAY || type == field_types::INT32_ARRAY || + type == field_types::FLOAT_ARRAY || + type == field_types::INT64_ARRAY || type == field_types::BOOL_ARRAY); + } + + std::string faceted_name() const { + return (facet && !is_string()) ? "_fstr_" + name : name; + } }; struct filter { diff --git a/include/index.h b/include/index.h index a95d17b3..1e019997 100644 --- a/include/index.h +++ b/include/index.h @@ -252,15 +252,6 @@ public: // in the query that have the least individual hits one by one until enough results are found. static const int DROP_TOKENS_THRESHOLD = 10; - // strings under this length will be fully highlighted, instead of showing a snippet of relevant portion - enum {SNIPPET_STR_ABOVE_LEN = 30}; - - // Using a $ prefix so that these meta keys stay above record entries in a lexicographically ordered KV store - static constexpr const char* COLLECTION_META_PREFIX = "$CM"; - static constexpr const char* COLLECTION_NEXT_SEQ_PREFIX = "$CS"; - static constexpr const char* SEQ_ID_PREFIX = "$SI"; - static constexpr const char* DOC_ID_PREFIX = "$DI"; - /* * Concurrency Primitives */ diff --git a/src/collection.cpp b/src/collection.cpp index 6fbe185b..9852ed23 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -749,6 +749,26 @@ Option Collection::search(const std::string & query, const std:: value = document[a_facet.field_name]; } else if(facet_schema.at(a_facet.field_name).type == field_types::STRING_ARRAY) { value = document[a_facet.field_name][facet_count.array_pos]; + } else if(facet_schema.at(a_facet.field_name).type == field_types::INT32) { + value = std::to_string(document[a_facet.field_name].get()); + } else if(facet_schema.at(a_facet.field_name).type == field_types::INT32_ARRAY) { + value 
= std::to_string(document[a_facet.field_name][facet_count.array_pos].get()); + } else if(facet_schema.at(a_facet.field_name).type == field_types::INT64) { + value = std::to_string(document[a_facet.field_name].get()); + } else if(facet_schema.at(a_facet.field_name).type == field_types::INT64_ARRAY) { + value = std::to_string(document[a_facet.field_name][facet_count.array_pos].get()); + } else if(facet_schema.at(a_facet.field_name).type == field_types::FLOAT) { + value = std::to_string(document[a_facet.field_name].get()); + value.erase ( value.find_last_not_of('0') + 1, std::string::npos ); // remove trailing zeros + } else if(facet_schema.at(a_facet.field_name).type == field_types::FLOAT_ARRAY) { + value = std::to_string(document[a_facet.field_name][facet_count.array_pos].get()); + value.erase ( value.find_last_not_of('0') + 1, std::string::npos ); // remove trailing zeros + } else if(facet_schema.at(a_facet.field_name).type == field_types::BOOL) { + value = std::to_string(document[a_facet.field_name].get()); + value = (value == "1") ? "true" : "false"; + } else if(facet_schema.at(a_facet.field_name).type == field_types::BOOL_ARRAY) { + value = std::to_string(document[a_facet.field_name][facet_count.array_pos].get()); + value = (value == "1") ? 
"true" : "false"; } std::vector tokens; diff --git a/src/index.cpp b/src/index.cpp index 3d3717c5..93571f64 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -15,10 +15,16 @@ Index::Index(const std::string name, const std::unordered_map Index::index_in_memory(const nlohmann::json &document, uint32_t // assumes that validation has already been done for(const std::pair & field_pair: search_schema) { const std::string & field_name = field_pair.first; - art_tree *t = search_index.at(field_name); int facet_id = -1; if(facet_schema.count(field_name) != 0) { facet_id = facet_to_id[field_name]; } + // non-string faceted field should be indexed as faceted string field as well + if(field_pair.second.facet && !field_pair.second.is_string()) { + art_tree *t = search_index.at(field_pair.second.faceted_name()); + + if(field_pair.second.is_array()) { + std::vector strings; + + if(field_pair.second.type == field_types::INT32_ARRAY) { + for(int32_t value: document[field_name]){ + strings.push_back(std::to_string(value)); + } + } else if(field_pair.second.type == field_types::INT64_ARRAY) { + for(int64_t value: document[field_name]){ + strings.push_back(std::to_string(value)); + } + } else if(field_pair.second.type == field_types::FLOAT_ARRAY) { + for(float value: document[field_name]){ + strings.push_back(std::to_string(value)); + } + } else if(field_pair.second.type == field_types::BOOL_ARRAY) { + for(bool value: document[field_name]){ + strings.push_back(std::to_string(value)); + } + } + index_string_array_field(strings, points, t, seq_id, facet_id); + } else { + std::string text; + + if(field_pair.second.type == field_types::INT32) { + text = std::to_string(document[field_name].get()); + } else if(field_pair.second.type == field_types::INT64) { + text = std::to_string(document[field_name].get()); + } else if(field_pair.second.type == field_types::FLOAT) { + text = std::to_string(document[field_name].get()); + } else if(field_pair.second.type == field_types::BOOL) { + text = 
std::to_string(document[field_name].get()); + } + + index_string_field(text, points, t, seq_id, facet_id); + } + } + + art_tree *t = search_index.at(field_name); + if(field_pair.second.type == field_types::STRING) { const std::string & text = document[field_name]; index_string_field(text, points, t, seq_id, facet_id); @@ -241,6 +290,7 @@ Option Index::validate_index_in_memory(const nlohmann::json &document, } } + // since every facet field has to be a search field, we don't have to revalidate types here for(const std::pair & field_pair: facet_schema) { const std::string & field_name = field_pair.first; @@ -248,22 +298,6 @@ Option Index::validate_index_in_memory(const nlohmann::json &document, return Option<>(400, "Field `" + field_name + "` has been declared as a facet field in the schema, " "but is not found in the document."); } - - if(field_pair.second.type == field_types::STRING) { - if(!document[field_name].is_string()) { - return Option<>(400, "Facet field `" + field_name + "` must be a string."); - } - } else if(field_pair.second.type == field_types::STRING_ARRAY) { - if(!document[field_name].is_array()) { - return Option<>(400, "Facet field `" + field_name + "` must be a string array."); - } - - if(document[field_name].size() > 0 && !document[field_name][0].is_string()) { - return Option<>(400, "Facet field `" + field_name + "` must be a string array."); - } - } else { - return Option<>(400, "Facet field `" + field_name + "` must be a string or a string[]."); - } } return Option<>(200); @@ -384,7 +418,6 @@ void Index::index_bool_field(const bool value, const uint32_t score, art_tree *t const int KEY_LEN = 1; unsigned char key[KEY_LEN]; key[0] = value ? 
'1' : '0'; - //key[1] = '\0'; uint32_t num_hits = 0; art_leaf* leaf = (art_leaf *) art_search(t, key, KEY_LEN); @@ -548,9 +581,21 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, if(a_facet.field_name == facet_query.field_name && !facet_query.query.empty()) { use_facet_query = true; + const field & facet_field = facet_schema.at(a_facet.field_name); + + if(facet_field.is_bool()) { + if(facet_query.query == "true") { + facet_query.query = "1"; + } else if(facet_query.query == "false") { + facet_query.query = "0"; + } + } + std::vector query_tokens; StringUtils::split(facet_query.query, query_tokens, " "); + art_tree *t = search_index.at(facet_field.faceted_name()); + for(size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) { auto & q = query_tokens[qtoken_index]; string_utils.unicode_normalize(q); @@ -558,9 +603,11 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, bool prefix_search = (qtoken_index == (query_tokens.size()-1)); // only last token must be used as prefix std::vector leaves; - art_fuzzy_search(search_index.at(a_facet.field_name), (const unsigned char *) q.c_str(), + + art_fuzzy_search(t, (const unsigned char *) q.c_str(), q.size(), 0, bounded_cost, 10000, token_ordering::MAX_SCORE, prefix_search, leaves); + for(size_t i = 0; i < leaves.size(); i++) { const auto & leaf = leaves[i]; // calculate hash without terminating null char diff --git a/test/collection_test.cpp b/test/collection_test.cpp index b2e47020..ecbd90fc 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -1773,9 +1773,10 @@ TEST_F(CollectionTest, FacetCounts) { std::ifstream infile(std::string(ROOT_DIR)+"test/numeric_array_documents.jsonl"); std::vector fields = {field("name", field_types::STRING, false), field("name_facet", field_types::STRING, true), - field("age", field_types::INT32, false), - field("years", field_types::INT32_ARRAY, false), - field("timestamps", field_types::INT64_ARRAY, 
false), + field("age", field_types::INT32, true), + field("years", field_types::INT32_ARRAY, true), + field("rating", field_types::FLOAT, true), + field("timestamps", field_types::INT64_ARRAY, true), field("tags", field_types::STRING_ARRAY, true)}; std::vector sort_fields = { sort_by("age", "DESC") }; @@ -1938,8 +1939,53 @@ TEST_F(CollectionTest, FacetCounts) { ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]); ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + // facet query on an integer field + results = coll_array_fields->search("*", query_fields, "", {"age"}, sort_fields, 0, 10, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, 500, "age: 2").get(); + + ASSERT_EQ(5, results["hits"].size()); + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_STREQ("age", results["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("24", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_STREQ("24", results["facet_counts"][0]["counts"][0]["highlighted"].get().c_str()); + + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]); + ASSERT_STREQ("21", results["facet_counts"][0]["counts"][1]["value"].get().c_str()); + ASSERT_STREQ("21", results["facet_counts"][0]["counts"][1]["highlighted"].get().c_str()); + + // facet query on a float field + results = coll_array_fields->search("*", query_fields, "", {"rating"}, sort_fields, 0, 10, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, 500, "rating: 7").get(); + + ASSERT_EQ(5, results["hits"].size()); + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_STREQ("rating", results["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("7.812", 
results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_STREQ("7.812", results["facet_counts"][0]["counts"][0]["highlighted"].get().c_str()); + + // facet query on an integer array field + + results = coll_array_fields->search("*", query_fields, "", {"timestamps"}, sort_fields, 0, 10, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, 500, "timestamps: 142189002").get(); + + ASSERT_EQ(5, results["hits"].size()); + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ(1, results["facet_counts"][0]["counts"].size()); + ASSERT_STREQ("timestamps", results["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("1421890022", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_STREQ("1421890022", results["facet_counts"][0]["counts"][0]["highlighted"].get().c_str()); + // facet query that does not match any indexed value - results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY, + results = coll_array_fields->search("*", query_fields, "", {facets}, sort_fields, 0, 10, 1, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, 500, " tags : notfound").get(); @@ -2101,6 +2147,60 @@ TEST_F(CollectionTest, FacetCountsHighlighting) { collectionManager.drop_collection("coll1"); } +TEST_F(CollectionTest, FacetCountsBool) { + Collection *coll1; + + std::vector fields = {field("title", field_types::STRING, false), + field("points", field_types::INT32, false), + field("in_stock", field_types::BOOL, true)}; + + std::vector sort_fields = {sort_by("points", "DESC")}; + + coll1 = collectionManager.get_collection("coll1"); + if (coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", fields, "points").get(); + } + + nlohmann::json doc; + doc["id"] = "100"; + doc["title"] = "Ford Mustang"; + 
doc["points"] = 25; + doc["in_stock"] = true; + + coll1->add(doc.dump()); + + doc["id"] = "101"; + doc["title"] = "Tesla Model S"; + doc["points"] = 40; + doc["in_stock"] = false; + + coll1->add(doc.dump()); + + doc["id"] = "102"; + doc["title"] = "Chevrolet Beat"; + doc["points"] = 10; + doc["in_stock"] = true; + + coll1->add(doc.dump()); + + std::vector facets = {"in_stock"}; + + nlohmann::json results = coll1->search("*", {"title"}, "in_stock:true", facets, sort_fields, 0, 10, 1, + token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, 500, "in_stock:true").get(); + + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ(1, results["facet_counts"][0]["counts"].size()); + + ASSERT_STREQ("in_stock", results["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("true", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_STREQ("true", + results["facet_counts"][0]["counts"][0]["highlighted"].get().c_str()); + + collectionManager.drop_collection("coll1"); +} + TEST_F(CollectionTest, SortingOrder) { Collection *coll_mul_fields;