Parameterize the token ordering field.

This commit is contained in:
Kishore Nallan 2017-03-26 21:26:01 +05:30
parent fab27d9f5c
commit 70dda716c5
7 changed files with 60 additions and 21 deletions

View File

@ -28,7 +28,7 @@
- ~~Filters~~
- ~~Facets~~
- ~~Schema validation during insertion (missing fields + type errors)~~
- Proper score field for ranking tokens
- ~~Proper score field for ranking tokens~~
- Prevent string copy during indexing
- Clean special characters before indexing
- The minimum number of results should be configurable instead of blindly using max_results

View File

@ -33,6 +33,8 @@ private:
spp::sparse_hash_map<std::string, spp::sparse_hash_map<uint32_t, int64_t>*> rank_index;
std::string token_ordering_field;
std::string get_doc_id_key(std::string doc_id);
std::string get_seq_id_key(uint32_t seq_id);
@ -80,7 +82,7 @@ public:
Collection(const std::string name, const uint32_t collection_id, const uint32_t next_seq_id, Store *store,
const std::vector<field> & search_fields, const std::vector<field> & facet_fields,
const std::vector<std::string> & rank_fields);
const std::vector<std::string> & rank_fields, const std::string token_ordering_field);
~Collection();
@ -102,6 +104,8 @@ public:
spp::sparse_hash_map<std::string, field> get_schema();
std::string get_token_ordering_field();
Option<std::string> add(std::string json_str);
nlohmann::json search(std::string query, const std::vector<std::string> search_fields,

View File

@ -23,6 +23,7 @@ private:
static constexpr const char* COLLECTION_SEARCH_FIELDS_KEY = "search_fields";
static constexpr const char* COLLECTION_FACET_FIELDS_KEY = "facet_fields";
static constexpr const char* COLLECTION_RANK_FIELDS_KEY = "rank_fields";
static constexpr const char* COLLECTION_TOKEN_ORDERING_FIELD_KEY = "token_ordering_field";
CollectionManager();
@ -41,7 +42,8 @@ public:
Collection* create_collection(std::string name, const std::vector<field> & search_fields,
const std::vector<field> & facet_fields,
const std::vector<std::string> & rank_fields);
const std::vector<std::string> & rank_fields,
const std::string & token_ordering_field = "");
Collection* get_collection(std::string collection_name);

View File

@ -9,8 +9,9 @@
Collection::Collection(const std::string name, const uint32_t collection_id, const uint32_t next_seq_id, Store *store,
const std::vector<field> &search_fields, const std::vector<field> & facet_fields,
const std::vector<std::string> & rank_fields):
name(name), collection_id(collection_id), next_seq_id(next_seq_id), store(store), rank_fields(rank_fields) {
const std::vector<std::string> & rank_fields, const std::string token_ordering_field):
name(name), collection_id(collection_id), next_seq_id(next_seq_id), store(store),
rank_fields(rank_fields), token_ordering_field(token_ordering_field) {
for(const field& field: search_fields) {
art_tree *t = new art_tree;
@ -79,9 +80,22 @@ Option<std::string> Collection::add(std::string json_str) {
}
Option<uint32_t> Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id) {
if(!token_ordering_field.empty() && document.count(token_ordering_field) == 0) {
return Option<>(400, "Field `" + token_ordering_field + "` has been declared as a token ordering field, "
"but is not found in the document.");
}
if(!document[token_ordering_field].is_number()) {
return Option<>(400, "Token ordering field `" + token_ordering_field + "` must be an INT32.");
}
if(document[token_ordering_field].get<int64_t>() > INT32_MAX) {
return Option<>(400, "Token ordering field `" + token_ordering_field + "` exceeds maximum value of INT32.");
}
uint32_t points = 0;
if(document.count("points") != 0) {
points = document["points"];
if(!token_ordering_field.empty()) {
points = document[token_ordering_field];
}
for(const std::pair<std::string, field> & field_pair: search_schema) {
@ -1061,4 +1075,8 @@ std::string Collection::get_meta_key(std::string collection_name) {
// Builds the key prefix "<collection_id>_<SEQ_ID_PREFIX>" for this collection.
std::string Collection::get_seq_id_collection_prefix() {
    const std::string id_part = std::to_string(collection_id);
    const std::string seq_part(SEQ_ID_PREFIX);
    return id_part + "_" + seq_part;
}
std::string Collection::get_token_ordering_field() {
return token_ordering_field;
}

View File

@ -48,13 +48,16 @@ void CollectionManager::init(Store *store) {
std::vector<std::string> collection_rank_fields =
collection_meta[COLLECTION_RANK_FIELDS_KEY].get<std::vector<std::string>>();
std::string token_ordering_field = collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY].get<std::string>();
Collection* collection = new Collection(this_collection_name,
collection_meta[COLLECTION_ID_KEY].get<uint32_t>(),
collection_next_seq_id,
store,
search_fields,
facet_fields,
collection_rank_fields);
collection_rank_fields,
token_ordering_field);
// Fetch records from the store and re-create memory index
std::vector<std::string> documents;
@ -79,7 +82,8 @@ void CollectionManager::init(Store *store) {
Collection* CollectionManager::create_collection(std::string name, const std::vector<field> & search_fields,
const std::vector<field> & facet_fields,
const std::vector<std::string> & rank_fields) {
const std::vector<std::string> & rank_fields,
const std::string & token_ordering_field) {
if(store->contains(Collection::get_meta_key(name))) {
return nullptr;
}
@ -107,8 +111,10 @@ Collection* CollectionManager::create_collection(std::string name, const std::ve
collection_meta[COLLECTION_SEARCH_FIELDS_KEY] = search_fields_json;
collection_meta[COLLECTION_FACET_FIELDS_KEY] = facet_fields_json;
collection_meta[COLLECTION_RANK_FIELDS_KEY] = rank_fields;
Collection* new_collection = new Collection(name, next_collection_id, 0, store, search_fields, facet_fields, rank_fields);
collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY] = token_ordering_field;
Collection* new_collection = new Collection(name, next_collection_id, 0, store, search_fields, facet_fields,
rank_fields, token_ordering_field);
store->insert(Collection::get_meta_key(name), collection_meta.dump());
store->insert(Collection::get_next_seq_id_key(name), std::to_string(0));

View File

@ -26,7 +26,8 @@ protected:
facet_fields = {field("starring", field_types::STRING)};
rank_fields = {"points"};
collection1 = collectionManager.create_collection("collection1", search_fields, facet_fields, rank_fields);
collection1 = collectionManager.create_collection("collection1", search_fields, facet_fields,
rank_fields, "points");
}
virtual void SetUp() {
@ -71,6 +72,7 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) {
ASSERT_EQ(facet_fields_expected, collection1->get_facet_fields());
ASSERT_EQ(rank_fields, collection1->get_rank_fields());
ASSERT_EQ(schema.size(), collection1->get_schema().size());
ASSERT_EQ("points", collection1->get_token_ordering_field());
results = collection1->search("thomas", search_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());

View File

@ -31,7 +31,8 @@ protected:
collection = collectionManager.get_collection("collection");
if(collection == nullptr) {
collection = collectionManager.create_collection("collection", search_fields, facet_fields, rank_fields);
collection = collectionManager.create_collection("collection", search_fields, facet_fields,
rank_fields, "points");
}
std::string json_line;
@ -787,23 +788,24 @@ TEST_F(CollectionTest, IndexingWithBadData) {
// should not crash when document to-be-indexed doesn't match schema
Collection *sample_collection;
std::vector<field> fields = {field("name", field_types::STRING), field("age", field_types::INT32)};
std::vector<field> fields = {field("name", field_types::STRING)};
facet_fields = {field("tags", field_types::STRING_ARRAY)};
std::vector<std::string> rank_fields = {"age", "average"};
sample_collection = collectionManager.get_collection("sample_collection");
if(sample_collection == nullptr) {
sample_collection = collectionManager.create_collection("sample_collection", fields, facet_fields, rank_fields);
sample_collection = collectionManager.create_collection("sample_collection", fields, facet_fields,
rank_fields, "age");
}
const Option<std::string> & search_fields_missing_op1 = sample_collection->add("{\"namezz\": \"foo\"}");
const Option<std::string> & search_fields_missing_op1 = sample_collection->add("{\"namezz\": \"foo\", \"age\": 29}");
ASSERT_FALSE(search_fields_missing_op1.ok());
ASSERT_STREQ("Field `name` has been declared as a search field in the schema, but is not found in the document.",
search_fields_missing_op1.error().c_str());
const Option<std::string> & search_fields_missing_op2 = sample_collection->add("{\"name\": \"foo\", \"agez\": 34}");
const Option<std::string> & search_fields_missing_op2 = sample_collection->add("{\"namez\": \"foo\", \"age\": 34}");
ASSERT_FALSE(search_fields_missing_op2.ok());
ASSERT_STREQ("Field `age` has been declared as a search field in the schema, but is not found in the document.",
ASSERT_STREQ("Field `name` has been declared as a search field in the schema, but is not found in the document.",
search_fields_missing_op2.error().c_str());
const Option<std::string> & facet_fields_missing_op1 = sample_collection->add("{\"name\": \"foo\", \"age\": 34}");
@ -830,9 +832,14 @@ TEST_F(CollectionTest, IndexingWithBadData) {
ASSERT_TRUE(empty_facet_field_op.ok());
doc_str = "{\"name\": \"foo\", \"age\": \"34\", \"tags\": [], \"average\": 34 }";
const Option<std::string> & bad_search_field_op = sample_collection->add(doc_str);
ASSERT_FALSE(bad_search_field_op.ok());
ASSERT_STREQ("Search field `age` must be an INT32.", bad_search_field_op.error().c_str());
const Option<std::string> & bad_token_ordering_field_op1 = sample_collection->add(doc_str);
ASSERT_FALSE(bad_token_ordering_field_op1.ok());
ASSERT_STREQ("Token ordering field `age` must be an INT32.", bad_token_ordering_field_op1.error().c_str());
doc_str = "{\"name\": \"foo\", \"age\": 343234324234233234, \"tags\": [], \"average\": 34 }";
const Option<std::string> & bad_token_ordering_field_op2 = sample_collection->add(doc_str);
ASSERT_FALSE(bad_token_ordering_field_op2.ok());
ASSERT_STREQ("Token ordering field `age` exceeds maximum value of INT32.", bad_token_ordering_field_op2.error().c_str());
doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": [], \"average\": \"34\"}";
const Option<std::string> & bad_rank_field_op = sample_collection->add(doc_str);