From 0bb8cf13bfb81a86be47c1605eab429e9a9b84d6 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 20 May 2017 17:02:14 +0530 Subject: [PATCH] Ensure that remove function really removes all auxiliary indexing data structures. --- TODO.md | 3 +- include/string_utils.h | 40 ------------- src/collection.cpp | 127 ++++++++++++++++++++++++++++++----------- 3 files changed, 95 insertions(+), 75 deletions(-) diff --git a/TODO.md b/TODO.md index 4824e954..f9f3a64e 100644 --- a/TODO.md +++ b/TODO.md @@ -34,6 +34,8 @@ - ~~Found count is wrong~~ - ~~Filter query in the API~~ - ~~Facet limit (hardcode to top 10)~~ +- ~~Deprecate old split function~~ +- ID should not have "/" - Handle store-get() not finding a key - Fix API response codes - Test for search without any sort_by given @@ -41,7 +43,6 @@ - Test for collection creation validation - Test for delete document - Proper pagination -- Deprecate old split function - Prevent string copy during indexing - clean special chars before indexing - Minimum results should be a variable instead of blindly going with max_results diff --git a/include/string_utils.h b/include/string_utils.h index 200e3371..88998d66 100644 --- a/include/string_utils.h +++ b/include/string_utils.h @@ -5,46 +5,6 @@ #include struct StringUtils { - - template - static void tokenize(const std::string &str, ContainerT &tokens, - const std::string &delimiters = " ", bool trimEmpty = true, unsigned long maxTokenLength = 100) { - const std::string truncated_str = str.substr(0, maxTokenLength); - std::string::size_type pos, lastPos = 0; - - using value_type = typename ContainerT::value_type; - using size_type = typename ContainerT::size_type; - - while (true) { - pos = truncated_str.find_first_of(delimiters, lastPos); - if (pos == std::string::npos) { - pos = truncated_str.length(); - - if (pos != lastPos || !trimEmpty) - tokens.push_back(value_type(truncated_str.data() + lastPos, - (size_type) pos - lastPos)); - - break; - } - else { - if (pos != lastPos || 
!trimEmpty) - tokens.push_back(value_type(truncated_str.data() + lastPos, - (size_type) pos - lastPos)); - } - - lastPos = pos + 1; - } - } - - static std::string replace_all(std::string str, const std::string &from, const std::string &to) { - size_t start_pos = 0; - while ((start_pos = str.find(from, start_pos)) != std::string::npos) { - str.replace(start_pos, from.length(), to); - start_pos += to.length(); // Handles case where 'to' is a substring of 'from' - } - return str; - } - // Adapted from: http://stackoverflow.com/a/236180/131050 static void split(const std::string& s, std::vector & result, const std::string& delim, const bool keep_empty = false) { if (delim.empty()) { diff --git a/src/collection.cpp b/src/collection.cpp index 2ac48c2e..d808699a 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -269,7 +269,7 @@ void Collection::index_string_field(const std::string & text, const uint32_t sco tokens.push_back(text); token_to_offsets[text].push_back(0); } else { - StringUtils::tokenize(text, tokens, " ", true); + StringUtils::split(text, tokens, " "); for(uint32_t i=0; i &topster, uint32_t** all_result_ids, size_t & all_result_ids_len, const token_ordering token_order, const bool prefix) { std::vector tokens; - StringUtils::tokenize(query, tokens, " ", true); + StringUtils::split(query, tokens, " "); const int max_cost = (num_typos < 0 || num_typos > 2) ? 
2 : num_typos; const size_t max_results = std::min(num_results, (size_t) Collection::MAX_RESULTS); @@ -1020,41 +1020,100 @@ Option Collection::remove(std::string id) { nlohmann::json document = nlohmann::json::parse(parsed_document); - std::vector tokens; - StringUtils::tokenize(document["title"], tokens, " ", true); - - for(auto token: tokens) { - std::transform(token.begin(), token.end(), token.begin(), ::tolower); - - const unsigned char *key = (const unsigned char *) token.c_str(); - int key_len = (int) (token.length() + 1); - - art_leaf* leaf = (art_leaf *) art_search(search_index.at("title"), key, key_len); - if(leaf != NULL) { - uint32_t seq_id_values[1] = {seq_id}; - - uint32_t doc_index = leaf->values->ids.indexOf(seq_id); - uint32_t start_offset = leaf->values->offset_index.at(doc_index); - uint32_t end_offset = (doc_index == leaf->values->ids.getLength() - 1) ? - leaf->values->offsets.getLength() : - leaf->values->offset_index.at(doc_index+1); - - uint32_t doc_indices[1] = {doc_index}; - remove_and_shift_offset_index(leaf->values->offset_index, doc_indices, 1); - - leaf->values->offsets.remove_index(start_offset, end_offset); - leaf->values->ids.remove_values(seq_id_values, 1); - - /*len = leaf->values->offset_index.getLength(); - for(auto i=0; ivalues->offset_index.at(i) << std::endl; + for(auto name_field: search_schema) { + std::vector tokens; + if(name_field.second.type == field_types::STRING) { + StringUtils::split(document[name_field.first], tokens, " "); + } else if(name_field.second.type == field_types::STRING_ARRAY) { + tokens = document[name_field.first].get>(); + } else if(name_field.second.type == field_types::INT32) { + const int KEY_LEN = 8; + unsigned char key[KEY_LEN]; + int32_t value = document[name_field.first].get(); + encode_int32(value, key); + tokens.push_back(std::string((char*)key, KEY_LEN)); + } else if(name_field.second.type == field_types::INT32_ARRAY) { + std::vector values = document[name_field.first].get>(); + for(const 
int32_t value: values) { + const int KEY_LEN = 8; + unsigned char key[KEY_LEN]; + encode_int32(value, key); + tokens.push_back(std::string((char*)key, KEY_LEN)); } - std::cout << "----" << std::endl;*/ - - if(leaf->values->ids.getLength() == 0) { - art_delete(search_index.at("title"), key, key_len); + } else if(name_field.second.type == field_types::INT64) { + const int KEY_LEN = 8; + unsigned char key[KEY_LEN]; + int64_t value = document[name_field.first].get(); + encode_int64(value, key); + tokens.push_back(std::string((char*)key, KEY_LEN)); + } else if(name_field.second.type == field_types::INT64_ARRAY) { + std::vector values = document[name_field.first].get>(); + for(const int64_t value: values) { + const int KEY_LEN = 8; + unsigned char key[KEY_LEN]; + encode_int64(value, key); + tokens.push_back(std::string((char*)key, KEY_LEN)); } } + + for(auto token: tokens) { + const unsigned char *key; + int key_len; + + if(name_field.second.type == field_types::STRING_ARRAY || name_field.second.type == field_types::STRING) { + std::transform(token.begin(), token.end(), token.begin(), ::tolower); + key = (const unsigned char *) token.c_str(); + key_len = (int) (token.length() + 1); + } else { + key = (const unsigned char *) token.c_str(); + key_len = (int) (token.length()); + } + + if(token == "https://twitter.com/yogalayout") { + std::cout << "token https://twitter.com/yogalayout" << std::endl; + } + + art_leaf* leaf = (art_leaf *) art_search(search_index.at(name_field.first), key, key_len); + if(leaf != NULL) { + uint32_t seq_id_values[1] = {seq_id}; + + if(leaf->values->ids.getLength() == 0) { + std::cout << "HEY!!!" << std::endl; + } + + uint32_t doc_index = leaf->values->ids.indexOf(seq_id); + uint32_t start_offset = leaf->values->offset_index.at(doc_index); + uint32_t end_offset = (doc_index == leaf->values->ids.getLength() - 1) ? 
+ leaf->values->offsets.getLength() : + leaf->values->offset_index.at(doc_index+1); + + uint32_t doc_indices[1] = {doc_index}; + remove_and_shift_offset_index(leaf->values->offset_index, doc_indices, 1); + + leaf->values->offsets.remove_index(start_offset, end_offset); + leaf->values->ids.remove_values(seq_id_values, 1); + + /*len = leaf->values->offset_index.getLength(); + for(auto i=0; ivalues->offset_index.at(i) << std::endl; + } + std::cout << "----" << std::endl;*/ + + if(leaf->values->ids.getLength() == 0) { + art_delete(search_index.at(name_field.first), key, key_len); + } + } + } + } + + // remove facets if any + for(auto field_facet_value: facet_index) { + field_facet_value.second.doc_values.erase(seq_id); + } + + // remove sort index if any + for(auto field_doc_value_map: sort_index) { + field_doc_value_map.second->erase(seq_id); } store->remove(get_doc_id_key(id));