mirror of
https://github.com/typesense/typesense.git
synced 2025-05-19 13:12:22 +08:00
Expose token ranking field properly via the API.
This commit is contained in:
parent
7531f9b13c
commit
b7bc974b8e
3
TODO.md
3
TODO.md
@ -35,7 +35,10 @@
|
||||
- ~~Filter query in the API~~
|
||||
- ~~Facet limit (hardcode to top 10)~~
|
||||
- ~~Deprecate old split function~~
|
||||
- When prefix=true, use token_ranking_field for token ordering
|
||||
- Search snippet
|
||||
- ID should not have "/"
|
||||
- Group results by field
|
||||
- Use rocksdb batch put for atomic insertion
|
||||
- Test for sorted_array::indexOf when length is 0
|
||||
- Handle store-get() not finding a key
|
||||
|
@ -110,7 +110,7 @@ typedef struct {
|
||||
*/
|
||||
typedef struct {
|
||||
art_values* values;
|
||||
uint16_t max_score;
|
||||
uint32_t max_score;
|
||||
uint32_t key_len;
|
||||
unsigned char key[];
|
||||
} art_leaf;
|
||||
|
@ -60,7 +60,7 @@ private:
|
||||
|
||||
spp::sparse_hash_map<std::string, spp::sparse_hash_map<uint32_t, int64_t>*> sort_index;
|
||||
|
||||
std::string token_ordering_field;
|
||||
std::string token_ranking_field;
|
||||
|
||||
std::string get_doc_id_key(const std::string & doc_id);
|
||||
|
||||
@ -110,7 +110,7 @@ public:
|
||||
|
||||
Collection(const std::string name, const uint32_t collection_id, const uint32_t next_seq_id, Store *store,
|
||||
const std::vector<field> & search_fields, const std::vector<field> & facet_fields,
|
||||
const std::vector<field> & sort_fields, const std::string token_ordering_field);
|
||||
const std::vector<field> & sort_fields, const std::string token_ranking_field);
|
||||
|
||||
~Collection();
|
||||
|
||||
@ -132,7 +132,7 @@ public:
|
||||
|
||||
spp::sparse_hash_map<std::string, field> get_schema();
|
||||
|
||||
std::string get_token_ordering_field();
|
||||
std::string get_token_ranking_field();
|
||||
|
||||
Option<std::string> add(const std::string & json_str);
|
||||
|
||||
|
@ -23,7 +23,7 @@ private:
|
||||
static constexpr const char* COLLECTION_SEARCH_FIELDS_KEY = "search_fields";
|
||||
static constexpr const char* COLLECTION_FACET_FIELDS_KEY = "facet_fields";
|
||||
static constexpr const char* COLLECTION_SORT_FIELDS_KEY = "sort_fields";
|
||||
static constexpr const char* COLLECTION_TOKEN_ORDERING_FIELD_KEY = "token_ordering_field";
|
||||
static constexpr const char* COLLECTION_TOKEN_ORDERING_FIELD_KEY = "token_ranking_field";
|
||||
|
||||
CollectionManager();
|
||||
|
||||
@ -43,7 +43,7 @@ public:
|
||||
Collection* create_collection(std::string name, const std::vector<field> & search_fields,
|
||||
const std::vector<field> & facet_fields,
|
||||
const std::vector<field> & sort_fields,
|
||||
const std::string & token_ordering_field = "");
|
||||
const std::string & token_ranking_field = "");
|
||||
|
||||
Collection* get_collection(std::string collection_name);
|
||||
|
||||
|
28
src/api.cpp
28
src/api.cpp
@ -101,7 +101,17 @@ void post_create_collection(http_req & req, http_res & res) {
|
||||
sort_fields.push_back(field(sort_field_json["name"], sort_field_json["type"]));
|
||||
}
|
||||
|
||||
collectionManager.create_collection(req_json["name"], search_fields, facet_fields, sort_fields);
|
||||
std::string token_ranking_field = "";
|
||||
|
||||
if(req_json.count("token_ranking_field") != 0) {
|
||||
if(!req_json["token_ranking_field"].is_string()) {
|
||||
return res.send_400("Wrong format for `token_ranking_field`. It should be a string (name of a field).");
|
||||
}
|
||||
|
||||
token_ranking_field = req_json["token_ranking_field"].get<std::string>();
|
||||
}
|
||||
|
||||
collectionManager.create_collection(req_json["name"], search_fields, facet_fields, sort_fields, token_ranking_field);
|
||||
res.send_201(req.body);
|
||||
}
|
||||
|
||||
@ -112,7 +122,6 @@ void get_search(http_req & req, http_res & res) {
|
||||
const char *SEARCH_BY = "search_by";
|
||||
const char *SORT_BY = "sort_by";
|
||||
const char *FACET_BY = "facet_by";
|
||||
const char *TOKEN_ORDERING = "token_ordering";
|
||||
|
||||
if(req.params.count(NUM_TYPOS) == 0) {
|
||||
req.params[NUM_TYPOS] = "2";
|
||||
@ -122,18 +131,12 @@ void get_search(http_req & req, http_res & res) {
|
||||
req.params[PREFIX] = "false";
|
||||
}
|
||||
|
||||
if(req.params.count(TOKEN_ORDERING) == 0) {
|
||||
req.params[TOKEN_ORDERING] = "FREQUENCY";
|
||||
}
|
||||
|
||||
if(req.params.count(SEARCH_BY) == 0) {
|
||||
return res.send_400(std::string("Parameter `") + SEARCH_BY + "` is required.");
|
||||
}
|
||||
|
||||
std::string filter_str = req.params.count(FILTER) != 0 ? req.params[FILTER] : "";
|
||||
|
||||
token_ordering token_order = (req.params[TOKEN_ORDERING] == "MAX_SCORE") ? MAX_SCORE : FREQUENCY;
|
||||
|
||||
std::vector<std::string> search_fields;
|
||||
StringUtils::split(req.params[SEARCH_BY], search_fields, ",");
|
||||
|
||||
@ -171,9 +174,16 @@ void get_search(http_req & req, http_res & res) {
|
||||
return res.send_404();
|
||||
}
|
||||
|
||||
bool prefix = (req.params[PREFIX] == "true");
|
||||
|
||||
token_ordering token_order = FREQUENCY;
|
||||
if(prefix && !collection->get_token_ranking_field().empty()) {
|
||||
token_order = MAX_SCORE;
|
||||
}
|
||||
|
||||
nlohmann::json result = collection->search(req.params["q"], search_fields, filter_str, facet_fields,
|
||||
sort_fields, std::stoi(req.params[NUM_TYPOS]), 100,
|
||||
token_order, false);
|
||||
token_order, prefix);
|
||||
const std::string & json_str = result.dump();
|
||||
//std::cout << "JSON:" << json_str << std::endl;
|
||||
struct rusage r_usage;
|
||||
|
@ -9,9 +9,9 @@
|
||||
|
||||
Collection::Collection(const std::string name, const uint32_t collection_id, const uint32_t next_seq_id, Store *store,
|
||||
const std::vector<field> &search_fields, const std::vector<field> & facet_fields,
|
||||
const std::vector<field> & sort_fields, const std::string token_ordering_field):
|
||||
const std::vector<field> & sort_fields, const std::string token_ranking_field):
|
||||
name(name), collection_id(collection_id), next_seq_id(next_seq_id), store(store),
|
||||
sort_fields(sort_fields), token_ordering_field(token_ordering_field) {
|
||||
sort_fields(sort_fields), token_ranking_field(token_ranking_field) {
|
||||
|
||||
for(const field& field: search_fields) {
|
||||
art_tree *t = new art_tree;
|
||||
@ -74,22 +74,22 @@ Option<std::string> Collection::add(const std::string & json_str) {
|
||||
}
|
||||
|
||||
Option<uint32_t> Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id) {
|
||||
if(!token_ordering_field.empty() && document.count(token_ordering_field) == 0) {
|
||||
return Option<>(400, "Field `" + token_ordering_field + "` has been declared as a token ordering field, "
|
||||
if(!token_ranking_field.empty() && document.count(token_ranking_field) == 0) {
|
||||
return Option<>(400, "Field `" + token_ranking_field + "` has been declared as a token ranking field, "
|
||||
"but is not found in the document.");
|
||||
}
|
||||
|
||||
if(!token_ordering_field.empty() && !document[token_ordering_field].is_number()) {
|
||||
return Option<>(400, "Token ordering field `" + token_ordering_field + "` must be an INT32.");
|
||||
if(!token_ranking_field.empty() && !document[token_ranking_field].is_number()) {
|
||||
return Option<>(400, "Token ranking field `" + token_ranking_field + "` must be an INT32.");
|
||||
}
|
||||
|
||||
if(!token_ordering_field.empty() && document[token_ordering_field].get<int64_t>() > INT32_MAX) {
|
||||
return Option<>(400, "Token ordering field `" + token_ordering_field + "` exceeds maximum value of INT32.");
|
||||
if(!token_ranking_field.empty() && document[token_ranking_field].get<int64_t>() > INT32_MAX) {
|
||||
return Option<>(400, "Token ranking field `" + token_ranking_field + "` exceeds maximum value of INT32.");
|
||||
}
|
||||
|
||||
uint32_t points = 0;
|
||||
if(!token_ordering_field.empty()) {
|
||||
points = document[token_ordering_field];
|
||||
if(!token_ranking_field.empty()) {
|
||||
points = document[token_ranking_field];
|
||||
}
|
||||
|
||||
for(const std::pair<std::string, field> & field_pair: search_schema) {
|
||||
@ -622,6 +622,7 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
|
||||
topster.sort();
|
||||
}
|
||||
|
||||
// order of fields specified matter: matching docs from earlier fields are more important
|
||||
for(auto t = 0; t < topster.size && t < num_results; t++) {
|
||||
field_order_kvs.push_back(std::make_pair(search_fields.size() - i, topster.getKV(t)));
|
||||
}
|
||||
@ -635,7 +636,7 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
|
||||
if(a.second.match_score != b.second.match_score) return a.second.match_score > b.second.match_score;
|
||||
if(a.second.primary_attr != b.second.primary_attr) return a.second.primary_attr > b.second.primary_attr;
|
||||
if(a.second.secondary_attr != b.second.secondary_attr) return a.second.secondary_attr > b.second.secondary_attr;
|
||||
if(a.first != b.first) return a.first > b.first;
|
||||
if(a.first != b.first) return a.first > b.first; // field position
|
||||
return a.second.key > b.second.key;
|
||||
});
|
||||
|
||||
@ -1169,6 +1170,6 @@ std::string Collection::get_seq_id_collection_prefix() {
|
||||
return std::to_string(collection_id) + "_" + std::string(SEQ_ID_PREFIX);
|
||||
}
|
||||
|
||||
std::string Collection::get_token_ordering_field() {
|
||||
return token_ordering_field;
|
||||
std::string Collection::get_token_ranking_field() {
|
||||
return token_ranking_field;
|
||||
}
|
@ -53,7 +53,7 @@ void CollectionManager::init(Store *store) {
|
||||
collection_sort_fields.push_back({it.value()[fields::name], it.value()[fields::type]});
|
||||
}
|
||||
|
||||
std::string token_ordering_field = collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY].get<std::string>();
|
||||
std::string token_ranking_field = collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY].get<std::string>();
|
||||
|
||||
Collection* collection = new Collection(this_collection_name,
|
||||
collection_meta[COLLECTION_ID_KEY].get<uint32_t>(),
|
||||
@ -62,7 +62,7 @@ void CollectionManager::init(Store *store) {
|
||||
search_fields,
|
||||
facet_fields,
|
||||
collection_sort_fields,
|
||||
token_ordering_field);
|
||||
token_ranking_field);
|
||||
|
||||
// Fetch records from the store and re-create memory index
|
||||
std::vector<std::string> documents;
|
||||
@ -88,7 +88,7 @@ void CollectionManager::init(Store *store) {
|
||||
Collection* CollectionManager::create_collection(std::string name, const std::vector<field> & search_fields,
|
||||
const std::vector<field> & facet_fields,
|
||||
const std::vector<field> & sort_fields,
|
||||
const std::string & token_ordering_field) {
|
||||
const std::string & token_ranking_field) {
|
||||
if(store->contains(Collection::get_meta_key(name))) {
|
||||
return nullptr;
|
||||
}
|
||||
@ -124,10 +124,10 @@ Collection* CollectionManager::create_collection(std::string name, const std::ve
|
||||
collection_meta[COLLECTION_SEARCH_FIELDS_KEY] = search_fields_json;
|
||||
collection_meta[COLLECTION_FACET_FIELDS_KEY] = facet_fields_json;
|
||||
collection_meta[COLLECTION_SORT_FIELDS_KEY] = sort_fields_json;
|
||||
collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY] = token_ordering_field;
|
||||
collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY] = token_ranking_field;
|
||||
|
||||
Collection* new_collection = new Collection(name, next_collection_id, 0, store, search_fields, facet_fields,
|
||||
sort_fields, token_ordering_field);
|
||||
sort_fields, token_ranking_field);
|
||||
|
||||
store->insert(Collection::get_meta_key(name), collection_meta.dump());
|
||||
store->insert(Collection::get_next_seq_id_key(name), std::to_string(0));
|
||||
|
@ -31,12 +31,14 @@ int main(int argc, char* argv[]) {
|
||||
};
|
||||
|
||||
std::vector<field> facet_fields_index = {
|
||||
// field("lang", field_types::STRING),
|
||||
// field("org", field_types::STRING),
|
||||
// field("topics", field_types::STRING_ARRAY)
|
||||
field("lang", field_types::STRING),
|
||||
field("org", field_types::STRING),
|
||||
field("topics", field_types::STRING_ARRAY)
|
||||
};
|
||||
|
||||
std::vector<field> sort_fields = { field("stars", "INT32")};
|
||||
std::vector<field> sort_fields = {
|
||||
field("stars", "INT32")
|
||||
};
|
||||
|
||||
Collection *collection = collectionManager.get_collection("github_top1k");
|
||||
|
||||
@ -45,7 +47,7 @@ int main(int argc, char* argv[]) {
|
||||
}
|
||||
|
||||
int j = 0;
|
||||
while(j < 1) {
|
||||
while(j < 1000) {
|
||||
j++;
|
||||
|
||||
std::ifstream infile(argv[1]);
|
||||
@ -53,11 +55,14 @@ int main(int argc, char* argv[]) {
|
||||
|
||||
cout << "BEGINNING Iteration: " << j << endl << flush;
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
int doc_id = 0;
|
||||
|
||||
while (std::getline(infile, json_line)) {
|
||||
nlohmann::json document = nlohmann::json::parse(json_line);
|
||||
//document["id"] = std::to_string(doc_id);
|
||||
document["id"] = document["org"].get<std::string>() + ":" + document["repo_name"].get<std::string>();
|
||||
collection->add(document.dump());
|
||||
doc_id++;
|
||||
}
|
||||
|
||||
infile.close();
|
||||
@ -70,16 +75,14 @@ int main(int argc, char* argv[]) {
|
||||
|
||||
std::ifstream infile2(argv[1]);
|
||||
|
||||
int counter = 0;
|
||||
doc_id = 0;
|
||||
|
||||
while (std::getline(infile2, json_line)) {
|
||||
counter++;
|
||||
nlohmann::json document = nlohmann::json::parse(json_line);
|
||||
//document["id"] = std::to_string(doc_id);
|
||||
document["id"] = document["org"].get<std::string>() + ":" + document["repo_name"].get<std::string>();
|
||||
collection->remove(document["id"]);
|
||||
/*if (counter % 100 == 0) {
|
||||
std::cout << "Removed " << counter << " so far..." << std::endl;
|
||||
}*/
|
||||
doc_id++;
|
||||
}
|
||||
|
||||
infile2.close();
|
||||
|
@ -76,7 +76,7 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) {
|
||||
ASSERT_EQ(1, collection1->get_sort_fields().size());
|
||||
ASSERT_EQ(sort_fields[0].name, collection1->get_sort_fields()[0].name);
|
||||
ASSERT_EQ(schema.size(), collection1->get_schema().size());
|
||||
ASSERT_EQ("points", collection1->get_token_ordering_field());
|
||||
ASSERT_EQ("points", collection1->get_token_ranking_field());
|
||||
|
||||
results = collection1->search("thomas", search_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false);
|
||||
ASSERT_EQ(4, results["hits"].size());
|
||||
|
@ -882,14 +882,20 @@ TEST_F(CollectionTest, IndexingWithBadData) {
|
||||
ASSERT_TRUE(empty_facet_field_op.ok());
|
||||
|
||||
doc_str = "{\"name\": \"foo\", \"age\": \"34\", \"tags\": [], \"average\": 34 }";
|
||||
const Option<std::string> & bad_token_ordering_field_op1 = sample_collection->add(doc_str);
|
||||
ASSERT_FALSE(bad_token_ordering_field_op1.ok());
|
||||
ASSERT_STREQ("Token ordering field `age` must be an INT32.", bad_token_ordering_field_op1.error().c_str());
|
||||
const Option<std::string> & bad_token_ranking_field_op1 = sample_collection->add(doc_str);
|
||||
ASSERT_FALSE(bad_token_ranking_field_op1.ok());
|
||||
ASSERT_STREQ("Token ranking field `age` must be an INT32.", bad_token_ranking_field_op1.error().c_str());
|
||||
|
||||
doc_str = "{\"name\": \"foo\", \"age\": 343234324234233234, \"tags\": [], \"average\": 34 }";
|
||||
const Option<std::string> & bad_token_ordering_field_op2 = sample_collection->add(doc_str);
|
||||
ASSERT_FALSE(bad_token_ordering_field_op2.ok());
|
||||
ASSERT_STREQ("Token ordering field `age` exceeds maximum value of INT32.", bad_token_ordering_field_op2.error().c_str());
|
||||
const Option<std::string> & bad_token_ranking_field_op2 = sample_collection->add(doc_str);
|
||||
ASSERT_FALSE(bad_token_ranking_field_op2.ok());
|
||||
ASSERT_STREQ("Token ranking field `age` exceeds maximum value of INT32.", bad_token_ranking_field_op2.error().c_str());
|
||||
|
||||
doc_str = "{\"name\": \"foo\", \"tags\": [], \"average\": 34 }";
|
||||
const Option<std::string> & bad_token_ranking_field_op3 = sample_collection->add(doc_str);
|
||||
ASSERT_FALSE(bad_token_ranking_field_op3.ok());
|
||||
ASSERT_STREQ("Field `age` has been declared as a token ranking field, but is not found in the document.",
|
||||
bad_token_ranking_field_op3.error().c_str());
|
||||
|
||||
doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": [], \"average\": \"34\"}";
|
||||
const Option<std::string> & bad_rank_field_op = sample_collection->add(doc_str);
|
||||
|
Loading…
x
Reference in New Issue
Block a user