Expose token ranking field properly via the API.

This commit is contained in:
Kishore Nallan 2017-05-27 14:02:32 +05:30
parent 7531f9b13c
commit b7bc974b8e
10 changed files with 73 additions and 50 deletions

View File

@ -35,7 +35,10 @@
- ~~Filter query in the API~~
- ~~Facet limit (hardcode to top 10)~~
- ~~Deprecate old split function~~
- When prefix=true, use token_ranking_field for token ordering
- Search snippet
- ID should not have "/"
- Group results by field
- Use rocksdb batch put for atomic insertion
- Test for sorted_array::indexOf when length is 0
- Handle store->get() not finding a key

View File

@ -110,7 +110,7 @@ typedef struct {
*/
typedef struct {
art_values* values;
uint16_t max_score;
uint32_t max_score;
uint32_t key_len;
unsigned char key[];
} art_leaf;

View File

@ -60,7 +60,7 @@ private:
spp::sparse_hash_map<std::string, spp::sparse_hash_map<uint32_t, int64_t>*> sort_index;
std::string token_ordering_field;
std::string token_ranking_field;
std::string get_doc_id_key(const std::string & doc_id);
@ -110,7 +110,7 @@ public:
Collection(const std::string name, const uint32_t collection_id, const uint32_t next_seq_id, Store *store,
const std::vector<field> & search_fields, const std::vector<field> & facet_fields,
const std::vector<field> & sort_fields, const std::string token_ordering_field);
const std::vector<field> & sort_fields, const std::string token_ranking_field);
~Collection();
@ -132,7 +132,7 @@ public:
spp::sparse_hash_map<std::string, field> get_schema();
std::string get_token_ordering_field();
std::string get_token_ranking_field();
Option<std::string> add(const std::string & json_str);

View File

@ -23,7 +23,7 @@ private:
static constexpr const char* COLLECTION_SEARCH_FIELDS_KEY = "search_fields";
static constexpr const char* COLLECTION_FACET_FIELDS_KEY = "facet_fields";
static constexpr const char* COLLECTION_SORT_FIELDS_KEY = "sort_fields";
static constexpr const char* COLLECTION_TOKEN_ORDERING_FIELD_KEY = "token_ordering_field";
static constexpr const char* COLLECTION_TOKEN_ORDERING_FIELD_KEY = "token_ranking_field";
CollectionManager();
@ -43,7 +43,7 @@ public:
Collection* create_collection(std::string name, const std::vector<field> & search_fields,
const std::vector<field> & facet_fields,
const std::vector<field> & sort_fields,
const std::string & token_ordering_field = "");
const std::string & token_ranking_field = "");
Collection* get_collection(std::string collection_name);

View File

@ -101,7 +101,17 @@ void post_create_collection(http_req & req, http_res & res) {
sort_fields.push_back(field(sort_field_json["name"], sort_field_json["type"]));
}
collectionManager.create_collection(req_json["name"], search_fields, facet_fields, sort_fields);
std::string token_ranking_field = "";
if(req_json.count("token_ranking_field") != 0) {
if(!req_json["token_ranking_field"].is_string()) {
return res.send_400("Wrong format for `token_ranking_field`. It should be a string (name of a field).");
}
token_ranking_field = req_json["token_ranking_field"].get<std::string>();
}
collectionManager.create_collection(req_json["name"], search_fields, facet_fields, sort_fields, token_ranking_field);
res.send_201(req.body);
}
@ -112,7 +122,6 @@ void get_search(http_req & req, http_res & res) {
const char *SEARCH_BY = "search_by";
const char *SORT_BY = "sort_by";
const char *FACET_BY = "facet_by";
const char *TOKEN_ORDERING = "token_ordering";
if(req.params.count(NUM_TYPOS) == 0) {
req.params[NUM_TYPOS] = "2";
@ -122,18 +131,12 @@ void get_search(http_req & req, http_res & res) {
req.params[PREFIX] = "false";
}
if(req.params.count(TOKEN_ORDERING) == 0) {
req.params[TOKEN_ORDERING] = "FREQUENCY";
}
if(req.params.count(SEARCH_BY) == 0) {
return res.send_400(std::string("Parameter `") + SEARCH_BY + "` is required.");
}
std::string filter_str = req.params.count(FILTER) != 0 ? req.params[FILTER] : "";
token_ordering token_order = (req.params[TOKEN_ORDERING] == "MAX_SCORE") ? MAX_SCORE : FREQUENCY;
std::vector<std::string> search_fields;
StringUtils::split(req.params[SEARCH_BY], search_fields, ",");
@ -171,9 +174,16 @@ void get_search(http_req & req, http_res & res) {
return res.send_404();
}
bool prefix = (req.params[PREFIX] == "true");
token_ordering token_order = FREQUENCY;
if(prefix && !collection->get_token_ranking_field().empty()) {
token_order = MAX_SCORE;
}
nlohmann::json result = collection->search(req.params["q"], search_fields, filter_str, facet_fields,
sort_fields, std::stoi(req.params[NUM_TYPOS]), 100,
token_order, false);
token_order, prefix);
const std::string & json_str = result.dump();
//std::cout << "JSON:" << json_str << std::endl;
struct rusage r_usage;

View File

@ -9,9 +9,9 @@
Collection::Collection(const std::string name, const uint32_t collection_id, const uint32_t next_seq_id, Store *store,
const std::vector<field> &search_fields, const std::vector<field> & facet_fields,
const std::vector<field> & sort_fields, const std::string token_ordering_field):
const std::vector<field> & sort_fields, const std::string token_ranking_field):
name(name), collection_id(collection_id), next_seq_id(next_seq_id), store(store),
sort_fields(sort_fields), token_ordering_field(token_ordering_field) {
sort_fields(sort_fields), token_ranking_field(token_ranking_field) {
for(const field& field: search_fields) {
art_tree *t = new art_tree;
@ -74,22 +74,22 @@ Option<std::string> Collection::add(const std::string & json_str) {
}
Option<uint32_t> Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id) {
if(!token_ordering_field.empty() && document.count(token_ordering_field) == 0) {
return Option<>(400, "Field `" + token_ordering_field + "` has been declared as a token ordering field, "
if(!token_ranking_field.empty() && document.count(token_ranking_field) == 0) {
return Option<>(400, "Field `" + token_ranking_field + "` has been declared as a token ranking field, "
"but is not found in the document.");
}
if(!token_ordering_field.empty() && !document[token_ordering_field].is_number()) {
return Option<>(400, "Token ordering field `" + token_ordering_field + "` must be an INT32.");
if(!token_ranking_field.empty() && !document[token_ranking_field].is_number()) {
return Option<>(400, "Token ranking field `" + token_ranking_field + "` must be an INT32.");
}
if(!token_ordering_field.empty() && document[token_ordering_field].get<int64_t>() > INT32_MAX) {
return Option<>(400, "Token ordering field `" + token_ordering_field + "` exceeds maximum value of INT32.");
if(!token_ranking_field.empty() && document[token_ranking_field].get<int64_t>() > INT32_MAX) {
return Option<>(400, "Token ranking field `" + token_ranking_field + "` exceeds maximum value of INT32.");
}
uint32_t points = 0;
if(!token_ordering_field.empty()) {
points = document[token_ordering_field];
if(!token_ranking_field.empty()) {
points = document[token_ranking_field];
}
for(const std::pair<std::string, field> & field_pair: search_schema) {
@ -622,6 +622,7 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
topster.sort();
}
// the order of the specified fields matters: matching docs from earlier fields are more important
for(auto t = 0; t < topster.size && t < num_results; t++) {
field_order_kvs.push_back(std::make_pair(search_fields.size() - i, topster.getKV(t)));
}
@ -635,7 +636,7 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
if(a.second.match_score != b.second.match_score) return a.second.match_score > b.second.match_score;
if(a.second.primary_attr != b.second.primary_attr) return a.second.primary_attr > b.second.primary_attr;
if(a.second.secondary_attr != b.second.secondary_attr) return a.second.secondary_attr > b.second.secondary_attr;
if(a.first != b.first) return a.first > b.first;
if(a.first != b.first) return a.first > b.first; // field position
return a.second.key > b.second.key;
});
@ -1169,6 +1170,6 @@ std::string Collection::get_seq_id_collection_prefix() {
return std::to_string(collection_id) + "_" + std::string(SEQ_ID_PREFIX);
}
std::string Collection::get_token_ordering_field() {
return token_ordering_field;
std::string Collection::get_token_ranking_field() {
return token_ranking_field;
}

View File

@ -53,7 +53,7 @@ void CollectionManager::init(Store *store) {
collection_sort_fields.push_back({it.value()[fields::name], it.value()[fields::type]});
}
std::string token_ordering_field = collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY].get<std::string>();
std::string token_ranking_field = collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY].get<std::string>();
Collection* collection = new Collection(this_collection_name,
collection_meta[COLLECTION_ID_KEY].get<uint32_t>(),
@ -62,7 +62,7 @@ void CollectionManager::init(Store *store) {
search_fields,
facet_fields,
collection_sort_fields,
token_ordering_field);
token_ranking_field);
// Fetch records from the store and re-create memory index
std::vector<std::string> documents;
@ -88,7 +88,7 @@ void CollectionManager::init(Store *store) {
Collection* CollectionManager::create_collection(std::string name, const std::vector<field> & search_fields,
const std::vector<field> & facet_fields,
const std::vector<field> & sort_fields,
const std::string & token_ordering_field) {
const std::string & token_ranking_field) {
if(store->contains(Collection::get_meta_key(name))) {
return nullptr;
}
@ -124,10 +124,10 @@ Collection* CollectionManager::create_collection(std::string name, const std::ve
collection_meta[COLLECTION_SEARCH_FIELDS_KEY] = search_fields_json;
collection_meta[COLLECTION_FACET_FIELDS_KEY] = facet_fields_json;
collection_meta[COLLECTION_SORT_FIELDS_KEY] = sort_fields_json;
collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY] = token_ordering_field;
collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY] = token_ranking_field;
Collection* new_collection = new Collection(name, next_collection_id, 0, store, search_fields, facet_fields,
sort_fields, token_ordering_field);
sort_fields, token_ranking_field);
store->insert(Collection::get_meta_key(name), collection_meta.dump());
store->insert(Collection::get_next_seq_id_key(name), std::to_string(0));

View File

@ -31,12 +31,14 @@ int main(int argc, char* argv[]) {
};
std::vector<field> facet_fields_index = {
// field("lang", field_types::STRING),
// field("org", field_types::STRING),
// field("topics", field_types::STRING_ARRAY)
field("lang", field_types::STRING),
field("org", field_types::STRING),
field("topics", field_types::STRING_ARRAY)
};
std::vector<field> sort_fields = { field("stars", "INT32")};
std::vector<field> sort_fields = {
field("stars", "INT32")
};
Collection *collection = collectionManager.get_collection("github_top1k");
@ -45,7 +47,7 @@ int main(int argc, char* argv[]) {
}
int j = 0;
while(j < 1) {
while(j < 1000) {
j++;
std::ifstream infile(argv[1]);
@ -53,11 +55,14 @@ int main(int argc, char* argv[]) {
cout << "BEGINNING Iteration: " << j << endl << flush;
auto begin = std::chrono::high_resolution_clock::now();
int doc_id = 0;
while (std::getline(infile, json_line)) {
nlohmann::json document = nlohmann::json::parse(json_line);
//document["id"] = std::to_string(doc_id);
document["id"] = document["org"].get<std::string>() + ":" + document["repo_name"].get<std::string>();
collection->add(document.dump());
doc_id++;
}
infile.close();
@ -70,16 +75,14 @@ int main(int argc, char* argv[]) {
std::ifstream infile2(argv[1]);
int counter = 0;
doc_id = 0;
while (std::getline(infile2, json_line)) {
counter++;
nlohmann::json document = nlohmann::json::parse(json_line);
//document["id"] = std::to_string(doc_id);
document["id"] = document["org"].get<std::string>() + ":" + document["repo_name"].get<std::string>();
collection->remove(document["id"]);
/*if (counter % 100 == 0) {
std::cout << "Removed " << counter << " so far..." << std::endl;
}*/
doc_id++;
}
infile2.close();

View File

@ -76,7 +76,7 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) {
ASSERT_EQ(1, collection1->get_sort_fields().size());
ASSERT_EQ(sort_fields[0].name, collection1->get_sort_fields()[0].name);
ASSERT_EQ(schema.size(), collection1->get_schema().size());
ASSERT_EQ("points", collection1->get_token_ordering_field());
ASSERT_EQ("points", collection1->get_token_ranking_field());
results = collection1->search("thomas", search_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());

View File

@ -882,14 +882,20 @@ TEST_F(CollectionTest, IndexingWithBadData) {
ASSERT_TRUE(empty_facet_field_op.ok());
doc_str = "{\"name\": \"foo\", \"age\": \"34\", \"tags\": [], \"average\": 34 }";
const Option<std::string> & bad_token_ordering_field_op1 = sample_collection->add(doc_str);
ASSERT_FALSE(bad_token_ordering_field_op1.ok());
ASSERT_STREQ("Token ordering field `age` must be an INT32.", bad_token_ordering_field_op1.error().c_str());
const Option<std::string> & bad_token_ranking_field_op1 = sample_collection->add(doc_str);
ASSERT_FALSE(bad_token_ranking_field_op1.ok());
ASSERT_STREQ("Token ranking field `age` must be an INT32.", bad_token_ranking_field_op1.error().c_str());
doc_str = "{\"name\": \"foo\", \"age\": 343234324234233234, \"tags\": [], \"average\": 34 }";
const Option<std::string> & bad_token_ordering_field_op2 = sample_collection->add(doc_str);
ASSERT_FALSE(bad_token_ordering_field_op2.ok());
ASSERT_STREQ("Token ordering field `age` exceeds maximum value of INT32.", bad_token_ordering_field_op2.error().c_str());
const Option<std::string> & bad_token_ranking_field_op2 = sample_collection->add(doc_str);
ASSERT_FALSE(bad_token_ranking_field_op2.ok());
ASSERT_STREQ("Token ranking field `age` exceeds maximum value of INT32.", bad_token_ranking_field_op2.error().c_str());
doc_str = "{\"name\": \"foo\", \"tags\": [], \"average\": 34 }";
const Option<std::string> & bad_token_ranking_field_op3 = sample_collection->add(doc_str);
ASSERT_FALSE(bad_token_ranking_field_op3.ok());
ASSERT_STREQ("Field `age` has been declared as a token ranking field, but is not found in the document.",
bad_token_ranking_field_op3.error().c_str());
doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": [], \"average\": \"34\"}";
const Option<std::string> & bad_rank_field_op = sample_collection->add(doc_str);