From 0bb8cf13bfb81a86be47c1605eab429e9a9b84d6 Mon Sep 17 00:00:00 2001
From: Kishore Nallan <kishore@kishorelive.com>
Date: Sat, 20 May 2017 17:02:14 +0530
Subject: [PATCH] Ensure that remove function really removes all auxiliary
 indexing data structures.

---
 TODO.md                |   3 +-
 include/string_utils.h |  40 -------------
 src/collection.cpp     | 127 ++++++++++++++++++++++++++++++-----------
 3 files changed, 95 insertions(+), 75 deletions(-)
diff --git a/TODO.md b/TODO.md
index 4824e954..f9f3a64e 100644
--- a/TODO.md
+++ b/TODO.md
@@ -34,6 +34,8 @@
 - ~~Found count is wrong~~
 - ~~Filter query in the API~~
 - ~~Facet limit (hardcode to top 10)~~
+- ~~Deprecate old split function~~
+- ID should not have "/"
 - Handle store-get() not finding a key
 - Fix API response codes
 - Test for search without any sort_by given
@@ -41,7 +43,6 @@
 - Test for collection creation validation
 - Test for delete document
 - Proper pagination
-- Deprecate old split function
 - Prevent string copy during indexing
 - clean special chars before indexing
 - Minimum results should be a variable instead of blindly going with max_results
diff --git a/include/string_utils.h b/include/string_utils.h
index 200e3371..88998d66 100644
--- a/include/string_utils.h
+++ b/include/string_utils.h
@@ -5,46 +5,6 @@
 #include <sstream>
 
 struct StringUtils {
-
-    template<class ContainerT>
-    static void tokenize(const std::string &str, ContainerT &tokens,
-                  const std::string &delimiters = " ", bool trimEmpty = true, unsigned long maxTokenLength = 100) {
-        const std::string truncated_str = str.substr(0, maxTokenLength);
-        std::string::size_type pos, lastPos = 0;
-
-        using value_type = typename ContainerT::value_type;
-        using size_type  = typename ContainerT::size_type;
-
-        while (true) {
-            pos = truncated_str.find_first_of(delimiters, lastPos);
-            if (pos == std::string::npos) {
-                pos = truncated_str.length();
-
-                if (pos != lastPos || !trimEmpty)
-                    tokens.push_back(value_type(truncated_str.data() + lastPos,
-                                                (size_type) pos - lastPos));
-
-                break;
-            }
-            else {
-                if (pos != lastPos || !trimEmpty)
-                    tokens.push_back(value_type(truncated_str.data() + lastPos,
-                                                (size_type) pos - lastPos));
-            }
-
-            lastPos = pos + 1;
-        }
-    }
-
-    static std::string replace_all(std::string str, const std::string &from, const std::string &to) {
-        size_t start_pos = 0;
-        while ((start_pos = str.find(from, start_pos)) != std::string::npos) {
-            str.replace(start_pos, from.length(), to);
-            start_pos += to.length(); // Handles case where 'to' is a substring of 'from'
-        }
-        return str;
-    }
-
     // Adapted from: http://stackoverflow.com/a/236180/131050
     static void split(const std::string& s, std::vector<std::string> & result, const std::string& delim, const bool keep_empty = false) {
         if (delim.empty()) {
diff --git a/src/collection.cpp b/src/collection.cpp
index 2ac48c2e..d808699a 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -269,7 +269,7 @@ void Collection::index_string_field(const std::string & text, const uint32_t sco
         tokens.push_back(text);
         token_to_offsets[text].push_back(0);
     } else {
-        StringUtils::tokenize(text, tokens, " ", true);
+        StringUtils::split(text, tokens, " ");
         for(uint32_t i=0; i<tokens.size(); i++) {
             auto token = tokens[i];
             transform(token.begin(), token.end(), token.begin(), tolower);
@@ -706,7 +706,7 @@ void Collection::search_field(std::string & query, const std::string & field, ui
                               const size_t num_results, Topster<100> &topster, uint32_t** all_result_ids,
                               size_t & all_result_ids_len, const token_ordering token_order, const bool prefix) {
     std::vector<std::string> tokens;
-    StringUtils::tokenize(query, tokens, " ", true);
+    StringUtils::split(query, tokens, " ");
 
     const int max_cost = (num_typos < 0 || num_typos > 2) ? 2 : num_typos;
     const size_t max_results = std::min(num_results, (size_t) Collection::MAX_RESULTS);
@@ -1020,41 +1020,100 @@ Option<std::string> Collection::remove(std::string id) {
 
     nlohmann::json document = nlohmann::json::parse(parsed_document);
 
-    std::vector<std::string> tokens;
-    StringUtils::tokenize(document["title"], tokens, " ", true);
-
-    for(auto token: tokens) {
-        std::transform(token.begin(), token.end(), token.begin(), ::tolower);
-
-        const unsigned char *key = (const unsigned char *) token.c_str();
-        int key_len = (int) (token.length() + 1);
-
-        art_leaf* leaf = (art_leaf *) art_search(search_index.at("title"), key, key_len);
-        if(leaf != NULL) {
-            uint32_t seq_id_values[1] = {seq_id};
-
-            uint32_t doc_index = leaf->values->ids.indexOf(seq_id);
-            uint32_t start_offset = leaf->values->offset_index.at(doc_index);
-            uint32_t end_offset = (doc_index == leaf->values->ids.getLength() - 1) ?
-                                  leaf->values->offsets.getLength() :
-                                  leaf->values->offset_index.at(doc_index+1);
-
-            uint32_t doc_indices[1] = {doc_index};
-            remove_and_shift_offset_index(leaf->values->offset_index, doc_indices, 1);
-
-            leaf->values->offsets.remove_index(start_offset, end_offset);
-            leaf->values->ids.remove_values(seq_id_values, 1);
-
-            /*len = leaf->values->offset_index.getLength();
-            for(auto i=0; i<len; i++) {
-                std::cout << "i: " << i << ", val: " << leaf->values->offset_index.at(i) << std::endl;
+    for(auto name_field: search_schema) {
+        std::vector<std::string> tokens;
+        if(name_field.second.type == field_types::STRING) {
+            StringUtils::split(document[name_field.first], tokens, " ");
+        } else if(name_field.second.type == field_types::STRING_ARRAY) {
+            tokens = document[name_field.first].get<std::vector<std::string>>();
+        } else if(name_field.second.type == field_types::INT32) {
+            const int KEY_LEN = 8;
+            unsigned char key[KEY_LEN];
+            int32_t value = document[name_field.first].get<int32_t>();
+            encode_int32(value, key);
+            tokens.push_back(std::string((char*)key, KEY_LEN));
+        } else if(name_field.second.type == field_types::INT32_ARRAY) {
+            std::vector<int32_t> values = document[name_field.first].get<std::vector<int32_t>>();
+            for(const int32_t value: values) {
+                const int KEY_LEN = 8;
+                unsigned char key[KEY_LEN];
+                encode_int32(value, key);
+                tokens.push_back(std::string((char*)key, KEY_LEN));
             }
-            std::cout << "----" << std::endl;*/
-
-            if(leaf->values->ids.getLength() == 0) {
-                art_delete(search_index.at("title"), key, key_len);
+        } else if(name_field.second.type == field_types::INT64) {
+            const int KEY_LEN = 8;
+            unsigned char key[KEY_LEN];
+            int64_t value = document[name_field.first].get<int64_t>();
+            encode_int64(value, key);
+            tokens.push_back(std::string((char*)key, KEY_LEN));
+        } else if(name_field.second.type == field_types::INT64_ARRAY) {
+            std::vector<int64_t> values = document[name_field.first].get<std::vector<int64_t>>();
+            for(const int64_t value: values) {
+                const int KEY_LEN = 8;
+                unsigned char key[KEY_LEN];
+                encode_int64(value, key);
+                tokens.push_back(std::string((char*)key, KEY_LEN));
             }
         }
+
+        for(auto token: tokens) {
+            const unsigned char *key;
+            int key_len;
+
+            if(name_field.second.type == field_types::STRING_ARRAY || name_field.second.type == field_types::STRING) {
+                std::transform(token.begin(), token.end(), token.begin(), ::tolower);
+                key = (const unsigned char *) token.c_str();
+                key_len = (int) (token.length() + 1);
+            } else {
+                key = (const unsigned char *) token.c_str();
+                key_len = (int) (token.length());
+            }
+
+            if(token == "https://twitter.com/yogalayout") {
+                std::cout << "token https://twitter.com/yogalayout" << std::endl;
+            }
+
+            art_leaf* leaf = (art_leaf *) art_search(search_index.at(name_field.first), key, key_len);
+            if(leaf != NULL) {
+                uint32_t seq_id_values[1] = {seq_id};
+
+                if(leaf->values->ids.getLength() == 0) {
+                    std::cout << "HEY!!!" << std::endl;
+                }
+
+                uint32_t doc_index = leaf->values->ids.indexOf(seq_id);
+                uint32_t start_offset = leaf->values->offset_index.at(doc_index);
+                uint32_t end_offset = (doc_index == leaf->values->ids.getLength() - 1) ?
+                                      leaf->values->offsets.getLength() :
+                                      leaf->values->offset_index.at(doc_index+1);
+
+                uint32_t doc_indices[1] = {doc_index};
+                remove_and_shift_offset_index(leaf->values->offset_index, doc_indices, 1);
+
+                leaf->values->offsets.remove_index(start_offset, end_offset);
+                leaf->values->ids.remove_values(seq_id_values, 1);
+
+                /*len = leaf->values->offset_index.getLength();
+                for(auto i=0; i<len; i++) {
+                    std::cout << "i: " << i << ", val: " << leaf->values->offset_index.at(i) << std::endl;
+                }
+                std::cout << "----" << std::endl;*/
+
+                if(leaf->values->ids.getLength() == 0) {
+                    art_delete(search_index.at(name_field.first), key, key_len);
+                }
+            }
+        }
+    }
+
+    // remove facets if any (bind by reference: erasing from a by-value copy would leave facet_index untouched)
+    for(auto & field_facet_value: facet_index) {
+        field_facet_value.second.doc_values.erase(seq_id);
+    }
+
+    // remove sort index if any (bind by reference to avoid copying each entry while iterating)
+    for(auto & field_doc_value_map: sort_index) {
+        field_doc_value_map.second->erase(seq_id);
     }
 
     store->remove(get_doc_id_key(id));