From 1d3af330ddf2f6f802b996d9f38efbb965eb3028 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sun, 28 Aug 2016 09:23:30 +0530 Subject: [PATCH] JSON document as input to `collection.add` method. --- TODO.md | 2 +- src/collection.cpp | 24 +++++++++++++----------- src/collection.h | 3 ++- src/main.cpp | 21 ++++++++------------- src/server.cpp | 20 +++++++------------- test/documents.jsonl | 16 ++++++++++++++++ test/documents.txt | 16 ---------------- 7 files changed, 47 insertions(+), 55 deletions(-) create mode 100644 test/documents.jsonl delete mode 100644 test/documents.txt diff --git a/TODO.md b/TODO.md index 7cb94a95..061f2fa9 100644 --- a/TODO.md +++ b/TODO.md @@ -4,7 +4,7 @@ **Search index** -- Proper JSON as input +- ~~Proper JSON as input~~ - Storing raw JSON input to RocksDB - ART for every indexed field - UTF-8 support for fuzzy search diff --git a/src/collection.cpp b/src/collection.cpp index deda470d..cac2aedc 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -2,12 +2,10 @@ #include #include -#include #include #include #include #include -#include Collection::Collection() { state = CollectionState(); @@ -18,8 +16,12 @@ Collection::~Collection() { art_tree_destroy(&t); } -void Collection::add(std::vector tokens, uint16_t score) { +void Collection::add(nlohmann::json document) { uint32_t doc_id = state.nextId(); + + uint16_t score = document["points"]; + std::vector tokens; + StringUtils::tokenize(document["title"], tokens, " ", true); std::unordered_map> token_to_offsets; for(uint32_t i=0; i tokens, uint16_t score) { } for(auto & kv: token_to_offsets) { - art_document document; - document.id = doc_id; - document.score = score; - document.offsets_len = (uint32_t) kv.second.size(); - document.offsets = new uint32_t[kv.second.size()]; + art_document art_doc; + art_doc.id = doc_id; + art_doc.score = score; + art_doc.offsets_len = (uint32_t) kv.second.size(); + art_doc.offsets = new uint32_t[kv.second.size()]; uint32_t num_hits = 0; @@ -48,11 +50,11 @@ void Collection::add(std::vector tokens, uint16_t score) { num_hits += 1; for(auto i=0; i #include #include +#include class Collection { private: @@ -15,7 +16,7 @@ private: public: Collection(); ~Collection(); - void add(std::vector tokens, uint16_t score); + void add(nlohmann::json document); void search(std::string query, size_t max_results); static inline std::vector _next_suggestion(const std::vector> &token_leaves, diff --git a/src/main.cpp b/src/main.cpp index 3eacb753..822cb1f2 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -7,29 +7,24 @@ #include #include "string_utils.h" #include "collection.h" +#include "json.hpp" using namespace std; int main() { Collection *collection = new Collection(); - std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.txt"); - //std::ifstream infile("/Users/kishore/Downloads/hnstories.tsv"); + std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.jsonl"); + //std::ifstream infile("/Users/kishore/Downloads/hnstories.jsonl"); - std::string line; + std::string jsonline; - while (std::getline(infile, line)) { - vector parts; - StringUtils::tokenize(line, parts, "\t", true); - line = StringUtils::replace_all(line, "\"", ""); - - vector tokens; - StringUtils::tokenize(parts[0], tokens, " ", true); - - if(parts.size() != 2) continue; - collection->add(tokens, stoi(parts[1])); + while (std::getline(infile, jsonline)) { + nlohmann::json document = nlohmann::json::parse(jsonline); + collection->add(document); } + infile.close(); cout << "FINISHED INDEXING!" << endl << flush; auto begin = std::chrono::high_resolution_clock::now(); diff --git a/src/server.cpp b/src/server.cpp index 3769b606..f1dea67e 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -145,23 +145,17 @@ static int create_listener(void) { } void index_documents() { - //std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.txt"); - std::ifstream infile("/Users/kishore/Downloads/hnstories.tsv"); + //std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.jsonl"); + std::ifstream infile("/Users/kishore/Downloads/hnstories.jsonl"); - std::string line; + std::string jsonline; - while (std::getline(infile, line)) { - std::vector parts; - StringUtils::tokenize(line, parts, "\t", true); - line = StringUtils::replace_all(line, "\"", ""); - - std::vector tokens; - StringUtils::tokenize(parts[0], tokens, " ", true); - - if(parts.size() != 2) continue; - collection->add(tokens, stoi(parts[1])); + while (std::getline(infile, jsonline)) { + nlohmann::json document = nlohmann::json::parse(jsonline); + collection->add(document); } + infile.close(); std::cout << "FINISHED INDEXING!" << std::endl << std::flush; } diff --git a/test/documents.jsonl b/test/documents.jsonl new file mode 100644 index 00000000..89c0c552 --- /dev/null +++ b/test/documents.jsonl @@ -0,0 +1,16 @@ +{"points":15,"title":"How are cryogenic rocket propellants delivered to the launch pad?"} +{"points":14,"title":"Are there any (free) are online data archives for data from instruments on Soviet / Russian missions?"} +{"points":13,"title":"Where should I look in ISS to find mouldy food?"} +{"points":13,"title":"Is solar system active cryovolcanism a potential viable power source for future colonies?"} +{"points":13,"title":"The heaviest martian spacecraft"} +{"points":13,"title":"To what extent are the US modules of ISS based on the Spacelab design?"} +{"points":12,"title":"Could future astronauts eat during EVAs?"} +{"points":12,"title":"What is the power requirement of a spacesuit?"} +{"points":12,"title":"How does plant growing medium not scatter around?"} +{"points":12,"title":"Is there research for the optimal small crew size for a long space voyage?"} +{"points":12,"title":"Do long term missions receive insurance coverage?"} +{"points":12,"title":"What do they exactly look for when searching for extraterrestrial intelligence?"} +{"points":11,"title":"What were emergency procedures for failure of launch vehicles with nuclear upper stages?"} +{"points":11,"title":"Mathematics used for F9R flyback lunch and landing"} +{"points":11,"title":"What considerations have been made lunch for waste produced during colonisation?"} +{"points":10,"title":"Do late do the propellants lunch ionize in chemical rockets?"} \ No newline at end of file diff --git a/test/documents.txt b/test/documents.txt deleted file mode 100644 index d276c2a6..00000000 --- a/test/documents.txt +++ /dev/null @@ -1,16 +0,0 @@ -How are cryogenic rocket propellants delivered to the launch pad? 15 -Are there any (free) are online data archives for data from instruments on Soviet / Russian missions? 14 -Where should I look in ISS to find mouldy food? 13 -Is solar system active cryovolcanism a potential viable power source for future colonies? 13 -The heaviest martian spacecraft 13 -To what extent are the US modules of ISS based on the Spacelab design? 13 -Could future astronauts eat during EVAs? 12 -What is the power requirement of a spacesuit? 12 -How does plant growing medium not scatter around? 12 -Is there research for the optimal small crew size for a long space voyage? 12 -Do long term missions receive insurance coverage? 12 -What do they exactly look for when searching for extraterrestrial intelligence? 12 -What were emergency procedures for failure of launch vehicles with nuclear upper stages? 11 -Mathematics used for F9R flyback lunch and landing 11 -What considerations have been made lunch for waste produced during colonisation? 11 -Do late do the propellants lunch ionize in chemical rockets? 10 \ No newline at end of file