diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 1d868e01..745e47e8 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -96,7 +96,7 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde LOG(ERROR) << "Unicode error during parsing: " << errcode; } } else { - token = unicode_text.tempSubString(prev_position, length).toUTF8String(word); + token = unicode_text.toLower().tempSubString(prev_position, length).toUTF8String(word); } if(!token.empty()) { diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp index 09cd6c4d..6ea594c4 100644 --- a/test/collection_specific_test.cpp +++ b/test/collection_specific_test.cpp @@ -1541,15 +1541,28 @@ TEST_F(CollectionSpecificTest, UpdateOfTwoDocsWithSameIdWithinSameBatch) { spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, "", "", {}, 1000, true).get(); - LOG(INFO) << results; + collectionManager.drop_collection("coll1"); +} - ASSERT_EQ(1, results["hits"].size()); - ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); - ASSERT_TRUE(results["hits"][0]["document"].contains("last_chance")); - ASSERT_EQ(false, results["hits"][0]["document"]["last_chance"].get()); +TEST_F(CollectionSpecificTest, CyrillicText) { + // when the first document containing a token already cannot fit compact posting list - ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("points")->size()); - ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("last_chance")->size()); + std::vector fields = {field("title", field_types::STRING, false, false, true, "sr"),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get(); + + nlohmann::json doc; + doc["title"] = "Test Тест"; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["title"] = "TEST ТЕСТ"; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto results = coll1->search("тест", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get(); + + ASSERT_EQ(2, results["hits"].size()); + ASSERT_EQ("1", results["hits"][0]["document"]["id"].get()); + ASSERT_EQ("0", results["hits"][1]["document"]["id"].get()); collectionManager.drop_collection("coll1"); } @@ -1779,9 +1792,10 @@ TEST_F(CollectionSpecificTest, VerbatimMatchShouldConsiderTokensMatchedAcrossAll ASSERT_EQ("3", results["hits"][0]["document"]["id"].get()); ASSERT_EQ("2", results["hits"][1]["document"]["id"].get()); + ASSERT_EQ(2, results["hits"].size()); + collectionManager.drop_collection("coll1"); } - TEST_F(CollectionSpecificTest, CustomNumTyposConfiguration) { // dropped tokens on a single field cannot be deemed as verbatim match