From 60cc05fe5202f76b2deee74bafb10ed039eabfb6 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 10 Feb 2017 10:47:33 +0530 Subject: [PATCH] Fixed a bug in bulk indexOf for array search. --- src/collection.cpp | 6 +- src/collection_manager.cpp | 2 + src/sorted_array.cpp | 36 ++- test/ids.txt | 562 +++++++++++++++++++++++++++++++++++++ test/sorted_array_test.cpp | 40 +++ 5 files changed, 632 insertions(+), 14 deletions(-) create mode 100644 test/ids.txt diff --git a/src/collection.cpp b/src/collection.cpp index a555ec5c..8c2bb041 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -472,7 +472,11 @@ void Collection::score_results(Topster<100> &topster, const int & token_rank, // for each token in the query, find the positions that it appears in this document for (art_leaf *token_leaf : query_suggestion) { std::vector positions; - uint32_t doc_index = leaf_to_indices.at(token_leaf)[i]; + int doc_index = leaf_to_indices.at(token_leaf)[i]; + if(doc_index == token_leaf->values->ids.getLength()) { + continue; + } + uint32_t start_offset = token_leaf->values->offset_index.at(doc_index); uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ? token_leaf->values->offsets.getLength() : diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index 03b24682..475b81b4 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -65,6 +65,8 @@ void CollectionManager::init(Store *store) { collections.emplace(Collection::get_meta_key(this_collection_name), collection); } + + std::cout << "Finished restoring all collections from disk."
<< std::endl; } Collection* CollectionManager::create_collection(std::string name, const std::vector & search_fields, diff --git a/src/sorted_array.cpp b/src/sorted_array.cpp index 21f5792d..e38b2a69 100644 --- a/src/sorted_array.cpp +++ b/src/sorted_array.cpp @@ -99,16 +99,14 @@ void sorted_array::binary_search_indices(const uint32_t *values, int low_vindex, uint32_t in_index = lower_bound_search_bits(in+METADATA_OVERHEAD, low_index, high_index, base, bits, values[pivot_vindex], &actual_value); - //if(actual_value == values[pivot_vindex]) { + if(actual_value == values[pivot_vindex]) { indices[pivot_vindex] = in_index; - //} + } else { + indices[pivot_vindex] = length; + } - size_t pivot_index = (low_index + high_index) / 2; - - binary_search_indices(values, low_vindex, pivot_vindex-1, low_index, pivot_index-1, - base, bits, indices); - binary_search_indices(values, pivot_vindex+1, high_vindex, pivot_index+1, high_index, - base, bits, indices); + binary_search_indices(values, low_vindex, pivot_vindex-1, low_index, in_index-1, base, bits, indices); + binary_search_indices(values, pivot_vindex+1, high_vindex, in_index+1, high_index, base, bits, indices); } } @@ -123,15 +121,27 @@ void sorted_array::indexOf(const uint32_t *values, const size_t values_len, uint uint32_t low_index, high_index; uint32_t actual_value = 0; + int head = -1; do { - low_index = lower_bound_search_bits(in+METADATA_OVERHEAD, 0, length-1, base, bits, values[0], &actual_value); - } while(actual_value != values[0]); + head++; + low_index = lower_bound_search_bits(in+METADATA_OVERHEAD, 0, length-1, base, bits, values[head], &actual_value); + } while(actual_value != values[head]); + int tail = values_len; do { - high_index = lower_bound_search_bits(in+METADATA_OVERHEAD, 0, length-1, base, bits, values[values_len-1], &actual_value); - } while(actual_value != values[values_len-1]); + tail--; + high_index = lower_bound_search_bits(in+METADATA_OVERHEAD, 0, length-1, base, bits, values[tail], 
&actual_value); + } while(actual_value != values[tail]); - binary_search_indices(values, 0, values_len-1, low_index, high_index, base, bits, indices); + for(int i = 0; i < head; i++) { + indices[i] = length; + } + + for(int j = values_len-1; j > tail; j--) { + indices[j] = length; + } + + binary_search_indices(values, head, tail, low_index, high_index, base, bits, indices); } void sorted_array::remove_values(uint32_t *sorted_values, uint32_t values_length) { diff --git a/test/ids.txt b/test/ids.txt new file mode 100644 index 00000000..4b0b7f09 --- /dev/null +++ b/test/ids.txt @@ -0,0 +1,562 @@ +2643 +4294 +5039 +9730 +11868 +17336 +17879 +20328 +20709 +21392 +22442 +23816 +26357 +32250 +34900 +37230 +43781 +44497 +51609 +53404 +65463 +66327 +67237 +71383 +72371 +72566 +76107 +76427 +80658 +81043 +81948 +83356 +83415 +88530 +89491 +89856 +90174 +100536 +101264 +102169 +102641 +104683 +107359 +108915 +109462 +110968 +119359 +124890 +125480 +125770 +126092 +126270 +129548 +130819 +131028 +135958 +141856 +145108 +145735 +148243 +151750 +153196 +155463 +159947 +166836 +167018 +167852 +168898 +169222 +169986 +171154 +175735 +175809 +177563 +180785 +189099 +191442 +193867 +194695 +200076 +204320 +204412 +208937 +211246 +211391 +211578 +211789 +212668 +213330 +216227 +216433 +216578 +217778 +220871 +220876 +222316 +229817 +232627 +232822 +232945 +235475 +240144 +245650 +250175 +252818 +255135 +255204 +255901 +258948 +264362 +270421 +271290 +271360 +272653 +275000 +278245 +280019 +284096 +286910 +288574 +291876 +292791 +301124 +301904 +303310 +303992 +305184 +305226 +305305 +320489 +329773 +330396 +335193 +337626 +341375 +341400 +341479 +342932 +350472 +352757 +357159 +358001 +360138 +361241 +361424 +361731 +361907 +361986 +362182 +362352 +362446 +362915 +363395 +371588 +372556 +373790 +374172 +376171 +377867 +380249 +380547 +381489 +387306 +394498 +394669 +395908 +403802 +405735 +406654 +407749 +411313 +412020 +413513 +415506 +416659 +421721 +430552 +431004 +431728 +432802 
+435046 +435487 +436423 +436619 +438680 +439057 +443104 +445251 +446975 +447300 +447301 +447475 +447484 +450607 +451981 +452814 +457189 +457279 +461135 +463214 +463270 +470659 +473112 +476094 +477038 +477159 +477360 +477373 +477446 +477772 +478820 +480338 +482122 +482276 +483307 +483399 +484612 +487527 +489523 +494574 +494854 +497082 +499474 +501102 +517955 +521304 +521816 +522424 +524887 +526650 +530299 +539367 +541649 +541993 +542642 +545236 +545795 +548999 +549915 +557687 +557933 +562022 +572272 +572808 +576168 +577846 +578716 +588773 +592005 +592990 +594170 +595575 +600537 +602172 +603001 +603784 +604291 +604406 +605028 +605227 +606541 +607980 +608085 +619129 +619379 +628153 +629283 +629383 +632876 +633176 +633345 +633520 +633591 +636002 +637472 +640551 +641962 +642329 +645814 +647370 +648778 +653523 +655344 +656498 +656531 +659421 +660515 +666410 +673194 +674587 +679811 +682007 +683330 +683829 +689117 +693077 +703836 +704775 +704897 +708116 +709106 +709449 +712200 +714731 +716848 +718261 +721202 +724727 +730271 +732733 +734363 +734394 +734951 +738616 +739942 +742159 +742716 +743327 +755175 +756971 +757516 +759044 +764515 +765977 +767374 +770014 +774744 +774769 +774841 +777257 +777402 +778395 +779951 +789697 +790347 +791710 +798241 +803700 +804614 +807096 +807981 +808356 +809492 +819651 +822139 +829247 +830403 +830794 +836087 +837663 +840204 +853695 +861693 +862362 +864379 +867655 +868958 +870374 +872885 +873552 +873892 +877700 +884280 +886403 +887987 +892202 +892915 +894974 +895417 +895709 +895840 +896114 +896366 +902231 +902342 +910794 +914263 +916380 +917313 +923594 +926776 +933533 +936195 +937159 +940717 +941263 +943542 +956687 +959380 +959718 +966450 +966547 +966691 +966833 +967148 +967333 +984506 +985786 +987060 +990145 +991341 +993858 +997743 +1001709 +1002638 +1003071 +1003152 +1003899 +1006187 +1018038 +1019224 +1020154 +1022082 +1025534 +1027809 +1034172 +1034513 +1035690 +1036886 +1038941 +1039181 +1039226 +1041282 +1041880 +1042694 +1044957 +1046465 
+1051417 +1051471 +1053143 +1053815 +1053840 +1053996 +1054074 +1059058 +1062782 +1065822 +1066555 +1068736 +1070621 +1072357 +1079188 +1084175 +1088441 +1088977 +1095973 +1096088 +1100237 +1101557 +1111000 +1111537 +1115936 +1115941 +1117837 +1119343 +1119483 +1120470 +1122853 +1123856 +1125220 +1129099 +1131872 +1133124 +1133539 +1133760 +1133923 +1134038 +1134437 +1134643 +1135468 +1136174 +1137190 +1141482 +1141484 +1145943 +1147611 +1148358 +1148577 +1148662 +1149264 +1151854 +1159053 +1162221 +1166426 +1168198 +1168801 +1169056 +1172736 +1177852 +1179556 +1180702 +1181709 +1182022 +1183075 +1183359 +1183822 +1186749 +1188946 +1190324 +1193534 +1193749 +1198459 +1200369 +1202206 +1203516 +1204727 +1205037 +1207960 +1209069 +1214248 +1218793 +1219422 +1220090 +1220312 +1221486 +1229083 +1230643 +1231187 +1238085 +1240292 +1241016 +1243925 +1245561 +1245762 +1246943 +1247773 +1248066 +1248144 +1248280 +1251758 +1252945 +1256479 +1264933 +1267166 +1268875 +1269339 +1269995 +1270198 +1271225 +1271546 +1271579 +1271718 +1271765 +1271810 +1272191 +1272310 +1272987 +1274427 +1275428 +1275483 +1280527 +1282608 +1295262 +1295375 +1298714 +1298724 +1304899 +1309025 +1311936 +1317142 +1317154 +1317857 \ No newline at end of file diff --git a/test/sorted_array_test.cpp b/test/sorted_array_test.cpp index ecc0a8c1..ea5c8d62 100644 --- a/test/sorted_array_test.cpp +++ b/test/sorted_array_test.cpp @@ -1,6 +1,8 @@ #include #include "sorted_array.h" #include +#include +#include "string_utils.h" TEST(SortedArrayTest, Append) { sorted_array arr; @@ -173,4 +175,42 @@ TEST(SortedArrayTest, MergeShouldRemoveDuplicatesAtBoundary) { } delete[] results; +} + +TEST(SortedArrayTest, BulkIndexOf) { + std::ifstream infile(std::string(ROOT_DIR)+"test/ids.txt"); + + sorted_array ids; + + std::string line; + while (std::getline(infile, line)) { + ids.append(std::stoi(line)); + } + infile.close(); + + std::vector search_ids = { 17879, 37230, 412020, 445251, 447484, 501102, 640551, 656498, 
656531, + 770014, 877700, 1034172, 1115941, 1129099, 1159053, 1221486, 1295375 }; + + uint32_t *results = new uint32_t[search_ids.size()]; + ids.indexOf(&search_ids[0], search_ids.size(), results); + + for(auto i = 0; i < search_ids.size(); i++) { + auto search_id = search_ids.at(i); + ASSERT_EQ(ids.indexOf(search_id), results[i]); + } + + // when some IDs are not to be found + + search_ids.clear(); + search_ids = { 7879, 37230, 422020, 445251, 457484, 501102, 630551}; + + delete [] results; + results = new uint32_t[search_ids.size()]; + + ids.indexOf(&search_ids[0], search_ids.size(), results); + + for(auto i = 0; i < search_ids.size(); i++) { + auto search_id = search_ids.at(i); + ASSERT_EQ(ids.indexOf(search_id), results[i]); + } } \ No newline at end of file