Fixed a bug in bulk indexOf for array search.

This commit is contained in:
Kishore Nallan 2017-02-10 10:47:33 +05:30
parent 645355067a
commit 60cc05fe52
5 changed files with 632 additions and 14 deletions

View File

@ -472,7 +472,11 @@ void Collection::score_results(Topster<100> &topster, const int & token_rank,
// for each token in the query, find the positions that it appears in this document
for (art_leaf *token_leaf : query_suggestion) {
std::vector<uint16_t> positions;
uint32_t doc_index = leaf_to_indices.at(token_leaf)[i];
int doc_index = leaf_to_indices.at(token_leaf)[i];
if(doc_index == token_leaf->values->ids.getLength()) {
continue;
}
uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
token_leaf->values->offsets.getLength() :

View File

@ -65,6 +65,8 @@ void CollectionManager::init(Store *store) {
collections.emplace(Collection::get_meta_key(this_collection_name), collection);
}
std::cout << "Finished restoring all collections from disk." << std::endl;
}
Collection* CollectionManager::create_collection(std::string name, const std::vector<field> & search_fields,

View File

@ -99,16 +99,14 @@ void sorted_array::binary_search_indices(const uint32_t *values, int low_vindex,
uint32_t in_index = lower_bound_search_bits(in+METADATA_OVERHEAD, low_index, high_index, base, bits,
values[pivot_vindex], &actual_value);
//if(actual_value == values[pivot_vindex]) {
if(actual_value == values[pivot_vindex]) {
indices[pivot_vindex] = in_index;
//}
} else {
indices[pivot_vindex] = length;
}
size_t pivot_index = (low_index + high_index) / 2;
binary_search_indices(values, low_vindex, pivot_vindex-1, low_index, pivot_index-1,
base, bits, indices);
binary_search_indices(values, pivot_vindex+1, high_vindex, pivot_index+1, high_index,
base, bits, indices);
binary_search_indices(values, low_vindex, pivot_vindex-1, low_index, in_index-1, base, bits, indices);
binary_search_indices(values, pivot_vindex+1, high_vindex, in_index+1, high_index, base, bits, indices);
}
}
@ -123,15 +121,27 @@ void sorted_array::indexOf(const uint32_t *values, const size_t values_len, uint
uint32_t low_index, high_index;
uint32_t actual_value = 0;
int head = -1;
do {
low_index = lower_bound_search_bits(in+METADATA_OVERHEAD, 0, length-1, base, bits, values[0], &actual_value);
} while(actual_value != values[0]);
head++;
low_index = lower_bound_search_bits(in+METADATA_OVERHEAD, 0, length-1, base, bits, values[head], &actual_value);
} while(actual_value != values[head]);
int tail = values_len;
do {
high_index = lower_bound_search_bits(in+METADATA_OVERHEAD, 0, length-1, base, bits, values[values_len-1], &actual_value);
} while(actual_value != values[values_len-1]);
tail--;
high_index = lower_bound_search_bits(in+METADATA_OVERHEAD, 0, length-1, base, bits, values[tail], &actual_value);
} while(actual_value != values[tail]);
binary_search_indices(values, 0, values_len-1, low_index, high_index, base, bits, indices);
for(int i = 0; i < head; i++) {
indices[i] = length;
}
for(int j = values_len-1; j > tail; j--) {
indices[j] = length;
}
binary_search_indices(values, head, tail, low_index, high_index, base, bits, indices);
}
void sorted_array::remove_values(uint32_t *sorted_values, uint32_t values_length) {

562
test/ids.txt Normal file
View File

@ -0,0 +1,562 @@
2643
4294
5039
9730
11868
17336
17879
20328
20709
21392
22442
23816
26357
32250
34900
37230
43781
44497
51609
53404
65463
66327
67237
71383
72371
72566
76107
76427
80658
81043
81948
83356
83415
88530
89491
89856
90174
100536
101264
102169
102641
104683
107359
108915
109462
110968
119359
124890
125480
125770
126092
126270
129548
130819
131028
135958
141856
145108
145735
148243
151750
153196
155463
159947
166836
167018
167852
168898
169222
169986
171154
175735
175809
177563
180785
189099
191442
193867
194695
200076
204320
204412
208937
211246
211391
211578
211789
212668
213330
216227
216433
216578
217778
220871
220876
222316
229817
232627
232822
232945
235475
240144
245650
250175
252818
255135
255204
255901
258948
264362
270421
271290
271360
272653
275000
278245
280019
284096
286910
288574
291876
292791
301124
301904
303310
303992
305184
305226
305305
320489
329773
330396
335193
337626
341375
341400
341479
342932
350472
352757
357159
358001
360138
361241
361424
361731
361907
361986
362182
362352
362446
362915
363395
371588
372556
373790
374172
376171
377867
380249
380547
381489
387306
394498
394669
395908
403802
405735
406654
407749
411313
412020
413513
415506
416659
421721
430552
431004
431728
432802
435046
435487
436423
436619
438680
439057
443104
445251
446975
447300
447301
447475
447484
450607
451981
452814
457189
457279
461135
463214
463270
470659
473112
476094
477038
477159
477360
477373
477446
477772
478820
480338
482122
482276
483307
483399
484612
487527
489523
494574
494854
497082
499474
501102
517955
521304
521816
522424
524887
526650
530299
539367
541649
541993
542642
545236
545795
548999
549915
557687
557933
562022
572272
572808
576168
577846
578716
588773
592005
592990
594170
595575
600537
602172
603001
603784
604291
604406
605028
605227
606541
607980
608085
619129
619379
628153
629283
629383
632876
633176
633345
633520
633591
636002
637472
640551
641962
642329
645814
647370
648778
653523
655344
656498
656531
659421
660515
666410
673194
674587
679811
682007
683330
683829
689117
693077
703836
704775
704897
708116
709106
709449
712200
714731
716848
718261
721202
724727
730271
732733
734363
734394
734951
738616
739942
742159
742716
743327
755175
756971
757516
759044
764515
765977
767374
770014
774744
774769
774841
777257
777402
778395
779951
789697
790347
791710
798241
803700
804614
807096
807981
808356
809492
819651
822139
829247
830403
830794
836087
837663
840204
853695
861693
862362
864379
867655
868958
870374
872885
873552
873892
877700
884280
886403
887987
892202
892915
894974
895417
895709
895840
896114
896366
902231
902342
910794
914263
916380
917313
923594
926776
933533
936195
937159
940717
941263
943542
956687
959380
959718
966450
966547
966691
966833
967148
967333
984506
985786
987060
990145
991341
993858
997743
1001709
1002638
1003071
1003152
1003899
1006187
1018038
1019224
1020154
1022082
1025534
1027809
1034172
1034513
1035690
1036886
1038941
1039181
1039226
1041282
1041880
1042694
1044957
1046465
1051417
1051471
1053143
1053815
1053840
1053996
1054074
1059058
1062782
1065822
1066555
1068736
1070621
1072357
1079188
1084175
1088441
1088977
1095973
1096088
1100237
1101557
1111000
1111537
1115936
1115941
1117837
1119343
1119483
1120470
1122853
1123856
1125220
1129099
1131872
1133124
1133539
1133760
1133923
1134038
1134437
1134643
1135468
1136174
1137190
1141482
1141484
1145943
1147611
1148358
1148577
1148662
1149264
1151854
1159053
1162221
1166426
1168198
1168801
1169056
1172736
1177852
1179556
1180702
1181709
1182022
1183075
1183359
1183822
1186749
1188946
1190324
1193534
1193749
1198459
1200369
1202206
1203516
1204727
1205037
1207960
1209069
1214248
1218793
1219422
1220090
1220312
1221486
1229083
1230643
1231187
1238085
1240292
1241016
1243925
1245561
1245762
1246943
1247773
1248066
1248144
1248280
1251758
1252945
1256479
1264933
1267166
1268875
1269339
1269995
1270198
1271225
1271546
1271579
1271718
1271765
1271810
1272191
1272310
1272987
1274427
1275428
1275483
1280527
1282608
1295262
1295375
1298714
1298724
1304899
1309025
1311936
1317142
1317154
1317857

View File

@ -1,6 +1,8 @@
#include <gtest/gtest.h>
#include "sorted_array.h"
#include <vector>
#include <fstream>
#include "string_utils.h"
TEST(SortedArrayTest, Append) {
sorted_array arr;
@ -173,4 +175,42 @@ TEST(SortedArrayTest, MergeShouldRemoveDuplicatesAtBoundary) {
}
delete[] results;
}
// Checks that the bulk indexOf() overload reports, for every queried ID, the same index as the
// single-value indexOf() lookup -- first with IDs that all occur in test/ids.txt, then with a
// mix of present and absent IDs.
TEST(SortedArrayTest, BulkIndexOf) {
    std::ifstream infile(std::string(ROOT_DIR)+"test/ids.txt");
    // Fail fast when the fixture is missing: otherwise the array stays empty and the bulk
    // search below would run against an array with no elements.
    ASSERT_TRUE(infile.is_open());

    sorted_array ids;
    std::string line;
    while (std::getline(infile, line)) {
        ids.append(std::stoi(line));
    }
    infile.close();

    std::vector<uint32_t> search_ids = { 17879, 37230, 412020, 445251, 447484, 501102, 640551, 656498, 656531,
                                         770014, 877700, 1034172, 1115941, 1129099, 1159053, 1221486, 1295375 };

    // A vector owns the output buffer, so it is released even when a fatal ASSERT_* returns
    // early from the test body (a raw new[] buffer would leak on that path).
    std::vector<uint32_t> results(search_ids.size());
    ids.indexOf(search_ids.data(), search_ids.size(), results.data());

    for(size_t i = 0; i < search_ids.size(); i++) {
        const uint32_t search_id = search_ids.at(i);
        ASSERT_EQ(ids.indexOf(search_id), results[i]);
    }

    // when some IDs are not to be found
    search_ids = { 7879, 37230, 422020, 445251, 457484, 501102, 630551};
    results.assign(search_ids.size(), 0);
    ids.indexOf(search_ids.data(), search_ids.size(), results.data());

    for(size_t i = 0; i < search_ids.size(); i++) {
        const uint32_t search_id = search_ids.at(i);
        ASSERT_EQ(ids.indexOf(search_id), results[i]);
    }
}