From 8271935c61a9c54f1051901d12e2b27fbc396072 Mon Sep 17 00:00:00 2001
From: Kishore Nallan <kishorenc@gmail.com>
Date: Tue, 28 Dec 2021 21:54:14 +0530
Subject: [PATCH] Fix more Cyrillic highlight issues.

---
 src/collection.cpp              | 24 +++++++++++++++---------
 src/tokenizer.cpp               |  4 +++-
 test/collection_locale_test.cpp | 30 +++++++++++++++++++++++++++---
 3 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/src/collection.cpp b/src/collection.cpp
index 62599bd3..ae160d04 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -1709,9 +1709,13 @@ void Collection::highlight_result(const field &search_field,
         const Match& match = match_index.match;
 
         size_t last_valid_offset = 0;
-        for (auto token_offset : match.offsets) {
+        int last_valid_offset_index = -1;
+
+        for(size_t match_offset_index = 0; match_offset_index < match.offsets.size(); match_offset_index++) {
+            const auto& token_offset = match.offsets[match_offset_index];
             if(token_offset.offset != MAX_DISPLACEMENT) {
                 last_valid_offset = token_offset.offset;
+                last_valid_offset_index = match_offset_index;
             } else {
                 break;
             }
@@ -1753,7 +1757,7 @@ void Collection::highlight_result(const field &search_field,
         // need an ordered map here to ensure that it is ordered by the key (start offset)
         std::map<size_t, size_t> token_offsets;
 
-        size_t match_offset_index = 0;
+        int match_offset_index = 0;
         std::string raw_token;
         std::set<std::string> token_hits;  // used to identify repeating tokens
         size_t raw_token_index = 0, tok_start = 0, tok_end = 0;
@@ -1788,7 +1792,7 @@ void Collection::highlight_result(const field &search_field,
 
             // ensures that the `snippet_start_offset` is always from a matched token, and not from query suggestion
             if ((found_first_match && token_already_found) ||
-                (match_offset_index < match.offsets.size() &&
+                (match_offset_index <= last_valid_offset_index &&
                  match.offsets[match_offset_index].offset == raw_token_index)) {
 
                 token_offsets.emplace(tok_start, tok_end);
@@ -1797,7 +1801,7 @@ void Collection::highlight_result(const field &search_field,
                 // to skip over duplicate tokens in the query
                 do {
                     match_offset_index++;
-                } while(match_offset_index < match.offsets.size() &&
+                } while(match_offset_index <= last_valid_offset_index &&
                         match.offsets[match_offset_index - 1].offset == match.offsets[match_offset_index].offset);
 
                 if(!found_first_match) {
@@ -1812,9 +1816,11 @@ void Collection::highlight_result(const field &search_field,
                 token_hits.insert(raw_token);
             }
 
-            if(raw_token_index == last_valid_offset + highlight_affix_num_tokens) {
+            if(raw_token_index >= last_valid_offset + highlight_affix_num_tokens) {
                 // register end of highlight snippet
-                snippet_end_offset = tok_end;
+                if(snippet_end_offset == text.size() - 1) {
+                    snippet_end_offset = tok_end;
+                }
             }
 
             // We can break early only if we have:
@@ -1823,8 +1829,8 @@ void Collection::highlight_result(const field &search_field,
             // c) raw_token_index exceeds snippet threshold
             // d) highlight fully is not requested
 
-            if(raw_token_index >= snippet_threshold - 1 &&
-               match_offset_index == match.offsets.size() &&
+            if(raw_token_index >= snippet_threshold &&
+               match_offset_index > last_valid_offset_index &&
                raw_token_index >= last_valid_offset + highlight_affix_num_tokens &&
                !highlighted_fully) {
                 break;
@@ -1835,7 +1841,7 @@ void Collection::highlight_result(const field &search_field,
             continue;
         }
 
-        if(raw_token_index + 1 < snippet_threshold) {
+        if(highlighted_fully || raw_token_index <= snippet_threshold-1) {
             // fully highlight field whose token size is less than given snippet threshold
             snippet_start_offset = 0;
             snippet_end_offset = text.size() - 1;
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index c291ecf9..32bdcae6 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -112,7 +112,9 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde
             }
 
             if(!token.empty()) {
-                if (!std::isalnum(token[0]) && is_ascii_char(token[0])) {
+                if(token == " " ||  token == "," || token == "." || token == "!" || token == "?") {
+                    found_token = false;
+                } else if (!std::isalnum(token[0]) && is_ascii_char(token[0])) {
                     // ignore ascii symbols
                     found_token = false;
                     token_counter++;
diff --git a/test/collection_locale_test.cpp b/test/collection_locale_test.cpp
index ae30fb59..d7e083dc 100644
--- a/test/collection_locale_test.cpp
+++ b/test/collection_locale_test.cpp
@@ -594,15 +594,18 @@ TEST_F(CollectionLocaleTest, SearchOnCyrillicTextWithSpecialCharacters) {
 
     ASSERT_TRUE(coll1->add(doc.dump()).ok());
 
-    auto results = coll1->search("отсутствие", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
+    auto results = coll1->search("отсутствие", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true},
+                                 10, spp::sparse_hash_set<std::string>(), spp::sparse_hash_set<std::string>(),
+                                 10, "", 10).get();
 
     ASSERT_EQ(1, results["hits"].size());
-    ASSERT_EQ("скромности. Посыл, среди которых <mark>отсутствие</mark> мобильного страшное",
+    ASSERT_EQ("скромности. Посыл, среди которых <mark>отсутствие</mark> мобильного страшное.",
               results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
 
     results = coll1->search("принятое", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
+
     ASSERT_EQ(1, results["hits"].size());
-    ASSERT_EQ("Сирый», «несчастный», «никчёмный» — <mark>принятое</mark> особ, сейчас, впрочем, оттенок скромности. Посыл, среди которых отсутствие мобильного страшное.",
+    ASSERT_EQ("«Сирый», «несчастный», «никчёмный» — <mark>принятое</mark> особ, сейчас, впрочем, оттенок скромности. Посыл, среди которых отсутствие мобильного страшное.",
               results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
 
     results = coll1->search("*", {}, "", {"title"}, {}, {0}, 0, 1, FREQUENCY, {true}, 10,
@@ -623,3 +626,24 @@ TEST_F(CollectionLocaleTest, SearchOnCyrillicTextWithSpecialCharacters) {
 
     collectionManager.drop_collection("coll1");
 }
+
+TEST_F(CollectionLocaleTest, SearchOnCyrillicLargeText) {  // a hit in a long Cyrillic field must yield a short snippet
+    std::vector<field> fields = {field("title", field_types::STRING, true, false, true, "ru"),};
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get();
+    // NOTE(review): "coll1" is created here but never dropped in this test, unlike the test above -- TODO confirm.
+    nlohmann::json doc;
+    doc["title"] = "Петр Великий, царь России, в начале 18 века санкционировал использование западных буквенных форм "
+                   "(ru). Со временем они были в значительной степени приняты на других языках, использующих этот "
+                   "сценарий. Таким образом, в отличие от большинства современных греческих шрифтов, которые сохранили "
+                   "свой собственный набор принципов дизайна для строчных букв (таких как размещение засечек, форма "
+                   "концов штриха и правила толщины штриха, хотя греческие заглавные буквы действительно используют "
+                   "латинский дизайн принципы) современные кириллические шрифты во многом такие же, как современные "
+                   "латинские шрифты того же семейства. Развитие некоторых кириллических компьютерных шрифтов из "
+                   "латинских также способствовало визуальной латинизации кириллического шрифта.";
+
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+    // The query is the 2nd word of the long title; the expected snippet is only its first six words, hit in <mark>.
+    auto results = coll1->search("Великий", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get();
+    ASSERT_STREQ("Петр <mark>Великий</mark>, царь России, в начале",
+                 results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
+}