Accept a JSON document as input to the Collection::add method.

This commit is contained in:
Kishore Nallan 2016-08-28 09:23:30 +05:30
parent 2804b145dd
commit 1d3af330dd
7 changed files with 47 additions and 55 deletions

View File

@ -4,7 +4,7 @@
**Search index**
- Proper JSON as input
- ~~Proper JSON as input~~
- Storing raw JSON input to RocksDB
- ART for every indexed field
- UTF-8 support for fuzzy search

View File

@ -2,12 +2,10 @@
#include <iostream>
#include <numeric>
#include <unordered_map>
#include <topster.h>
#include <intersection.h>
#include <match_score.h>
#include <string_utils.h>
#include <art.h>
Collection::Collection() {
state = CollectionState();
@ -18,8 +16,12 @@ Collection::~Collection() {
art_tree_destroy(&t);
}
void Collection::add(std::vector<std::string> tokens, uint16_t score) {
void Collection::add(nlohmann::json document) {
uint32_t doc_id = state.nextId();
uint16_t score = document["points"];
std::vector<std::string> tokens;
StringUtils::tokenize(document["title"], tokens, " ", true);
std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;
for(uint32_t i=0; i<tokens.size(); i++) {
@ -29,11 +31,11 @@ void Collection::add(std::vector<std::string> tokens, uint16_t score) {
}
for(auto & kv: token_to_offsets) {
art_document document;
document.id = doc_id;
document.score = score;
document.offsets_len = (uint32_t) kv.second.size();
document.offsets = new uint32_t[kv.second.size()];
art_document art_doc;
art_doc.id = doc_id;
art_doc.score = score;
art_doc.offsets_len = (uint32_t) kv.second.size();
art_doc.offsets = new uint32_t[kv.second.size()];
uint32_t num_hits = 0;
@ -48,11 +50,11 @@ void Collection::add(std::vector<std::string> tokens, uint16_t score) {
num_hits += 1;
for(auto i=0; i<kv.second.size(); i++) {
document.offsets[i] = kv.second[i];
art_doc.offsets[i] = kv.second[i];
}
art_insert(&t, (const unsigned char *) key, key_len, &document, num_hits);
delete document.offsets;
art_insert(&t, key, key_len, &art_doc, num_hits);
delete art_doc.offsets;
}
doc_scores[doc_id] = score;

View File

@ -6,6 +6,7 @@
#include <unordered_map>
#include <collection_state.h>
#include <topster.h>
#include <json.hpp>
class Collection {
private:
@ -15,7 +16,7 @@ private:
public:
Collection();
~Collection();
void add(std::vector<std::string> tokens, uint16_t score);
void add(nlohmann::json document);
void search(std::string query, size_t max_results);
static inline std::vector<art_leaf *> _next_suggestion(const std::vector<std::vector<art_leaf *>> &token_leaves,

View File

@ -7,29 +7,24 @@
#include <unordered_map>
#include "string_utils.h"
#include "collection.h"
#include "json.hpp"
using namespace std;
int main() {
Collection *collection = new Collection();
std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.txt");
//std::ifstream infile("/Users/kishore/Downloads/hnstories.tsv");
std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.jsonl");
//std::ifstream infile("/Users/kishore/Downloads/hnstories.jsonl");
std::string line;
std::string jsonline;
while (std::getline(infile, line)) {
vector<string> parts;
StringUtils::tokenize(line, parts, "\t", true);
line = StringUtils::replace_all(line, "\"", "");
vector<string> tokens;
StringUtils::tokenize(parts[0], tokens, " ", true);
if(parts.size() != 2) continue;
collection->add(tokens, stoi(parts[1]));
while (std::getline(infile, jsonline)) {
nlohmann::json document = nlohmann::json::parse(jsonline);
collection->add(document);
}
infile.close();
cout << "FINISHED INDEXING!" << endl << flush;
auto begin = std::chrono::high_resolution_clock::now();

View File

@ -145,23 +145,17 @@ static int create_listener(void) {
}
void index_documents() {
//std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.txt");
std::ifstream infile("/Users/kishore/Downloads/hnstories.tsv");
//std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.jsonl");
std::ifstream infile("/Users/kishore/Downloads/hnstories.jsonl");
std::string line;
std::string jsonline;
while (std::getline(infile, line)) {
std::vector<std::string> parts;
StringUtils::tokenize(line, parts, "\t", true);
line = StringUtils::replace_all(line, "\"", "");
std::vector<std::string> tokens;
StringUtils::tokenize(parts[0], tokens, " ", true);
if(parts.size() != 2) continue;
collection->add(tokens, stoi(parts[1]));
while (std::getline(infile, jsonline)) {
nlohmann::json document = nlohmann::json::parse(jsonline);
collection->add(document);
}
infile.close();
std::cout << "FINISHED INDEXING!" << std::endl << std::flush;
}

16
test/documents.jsonl Normal file
View File

@ -0,0 +1,16 @@
{"points":15,"title":"How are cryogenic rocket propellants delivered to the launch pad?"}
{"points":14,"title":"Are there any (free) are online data archives for data from instruments on Soviet / Russian missions?"}
{"points":13,"title":"Where should I look in ISS to find mouldy food?"}
{"points":13,"title":"Is solar system active cryovolcanism a potential viable power source for future colonies?"}
{"points":13,"title":"The heaviest martian spacecraft"}
{"points":13,"title":"To what extent are the US modules of ISS based on the Spacelab design?"}
{"points":12,"title":"Could future astronauts eat during EVAs?"}
{"points":12,"title":"What is the power requirement of a spacesuit?"}
{"points":12,"title":"How does plant growing medium not scatter around?"}
{"points":12,"title":"Is there research for the optimal small crew size for a long space voyage?"}
{"points":12,"title":"Do long term missions receive insurance coverage?"}
{"points":12,"title":"What do they exactly look for when searching for extraterrestrial intelligence?"}
{"points":11,"title":"What were emergency procedures for failure of launch vehicles with nuclear upper stages?"}
{"points":11,"title":"Mathematics used for F9R flyback lunch and landing"}
{"points":11,"title":"What considerations have been made lunch for waste produced during colonisation?"}
{"points":10,"title":"Do late do the propellants lunch ionize in chemical rockets?"}

View File

@ -1,16 +0,0 @@
How are cryogenic rocket propellants delivered to the launch pad? 15
Are there any (free) are online data archives for data from instruments on Soviet / Russian missions? 14
Where should I look in ISS to find mouldy food? 13
Is solar system active cryovolcanism a potential viable power source for future colonies? 13
The heaviest martian spacecraft 13
To what extent are the US modules of ISS based on the Spacelab design? 13
Could future astronauts eat during EVAs? 12
What is the power requirement of a spacesuit? 12
How does plant growing medium not scatter around? 12
Is there research for the optimal small crew size for a long space voyage? 12
Do long term missions receive insurance coverage? 12
What do they exactly look for when searching for extraterrestrial intelligence? 12
What were emergency procedures for failure of launch vehicles with nuclear upper stages? 11
Mathematics used for F9R flyback lunch and landing 11
What considerations have been made lunch for waste produced during colonisation? 11
Do late do the propellants lunch ionize in chemical rockets? 10