mirror of
https://github.com/typesense/typesense.git
synced 2025-05-20 13:42:26 +08:00
JSON document as input to collection.add method.
This commit is contained in:
parent
2804b145dd
commit
1d3af330dd
2
TODO.md
2
TODO.md
@ -4,7 +4,7 @@
|
||||
|
||||
**Search index**
|
||||
|
||||
- Proper JSON as input
|
||||
- ~~Proper JSON as input~~
|
||||
- Storing raw JSON input to RocksDB
|
||||
- ART for every indexed field
|
||||
- UTF-8 support for fuzzy search
|
||||
|
@ -2,12 +2,10 @@
|
||||
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <unordered_map>
|
||||
#include <topster.h>
|
||||
#include <intersection.h>
|
||||
#include <match_score.h>
|
||||
#include <string_utils.h>
|
||||
#include <art.h>
|
||||
|
||||
Collection::Collection() {
|
||||
state = CollectionState();
|
||||
@ -18,8 +16,12 @@ Collection::~Collection() {
|
||||
art_tree_destroy(&t);
|
||||
}
|
||||
|
||||
void Collection::add(std::vector<std::string> tokens, uint16_t score) {
|
||||
void Collection::add(nlohmann::json document) {
|
||||
uint32_t doc_id = state.nextId();
|
||||
|
||||
uint16_t score = document["points"];
|
||||
std::vector<std::string> tokens;
|
||||
StringUtils::tokenize(document["title"], tokens, " ", true);
|
||||
std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;
|
||||
|
||||
for(uint32_t i=0; i<tokens.size(); i++) {
|
||||
@ -29,11 +31,11 @@ void Collection::add(std::vector<std::string> tokens, uint16_t score) {
|
||||
}
|
||||
|
||||
for(auto & kv: token_to_offsets) {
|
||||
art_document document;
|
||||
document.id = doc_id;
|
||||
document.score = score;
|
||||
document.offsets_len = (uint32_t) kv.second.size();
|
||||
document.offsets = new uint32_t[kv.second.size()];
|
||||
art_document art_doc;
|
||||
art_doc.id = doc_id;
|
||||
art_doc.score = score;
|
||||
art_doc.offsets_len = (uint32_t) kv.second.size();
|
||||
art_doc.offsets = new uint32_t[kv.second.size()];
|
||||
|
||||
uint32_t num_hits = 0;
|
||||
|
||||
@ -48,11 +50,11 @@ void Collection::add(std::vector<std::string> tokens, uint16_t score) {
|
||||
num_hits += 1;
|
||||
|
||||
for(auto i=0; i<kv.second.size(); i++) {
|
||||
document.offsets[i] = kv.second[i];
|
||||
art_doc.offsets[i] = kv.second[i];
|
||||
}
|
||||
|
||||
art_insert(&t, (const unsigned char *) key, key_len, &document, num_hits);
|
||||
delete document.offsets;
|
||||
art_insert(&t, key, key_len, &art_doc, num_hits);
|
||||
delete art_doc.offsets;
|
||||
}
|
||||
|
||||
doc_scores[doc_id] = score;
|
||||
|
@ -6,6 +6,7 @@
|
||||
#include <unordered_map>
|
||||
#include <collection_state.h>
|
||||
#include <topster.h>
|
||||
#include <json.hpp>
|
||||
|
||||
class Collection {
|
||||
private:
|
||||
@ -15,7 +16,7 @@ private:
|
||||
public:
|
||||
Collection();
|
||||
~Collection();
|
||||
void add(std::vector<std::string> tokens, uint16_t score);
|
||||
void add(nlohmann::json document);
|
||||
void search(std::string query, size_t max_results);
|
||||
|
||||
static inline std::vector<art_leaf *> _next_suggestion(const std::vector<std::vector<art_leaf *>> &token_leaves,
|
||||
|
21
src/main.cpp
21
src/main.cpp
@ -7,29 +7,24 @@
|
||||
#include <unordered_map>
|
||||
#include "string_utils.h"
|
||||
#include "collection.h"
|
||||
#include "json.hpp"
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main() {
|
||||
Collection *collection = new Collection();
|
||||
|
||||
std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.txt");
|
||||
//std::ifstream infile("/Users/kishore/Downloads/hnstories.tsv");
|
||||
std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.jsonl");
|
||||
//std::ifstream infile("/Users/kishore/Downloads/hnstories.jsonl");
|
||||
|
||||
std::string line;
|
||||
std::string jsonline;
|
||||
|
||||
while (std::getline(infile, line)) {
|
||||
vector<string> parts;
|
||||
StringUtils::tokenize(line, parts, "\t", true);
|
||||
line = StringUtils::replace_all(line, "\"", "");
|
||||
|
||||
vector<string> tokens;
|
||||
StringUtils::tokenize(parts[0], tokens, " ", true);
|
||||
|
||||
if(parts.size() != 2) continue;
|
||||
collection->add(tokens, stoi(parts[1]));
|
||||
while (std::getline(infile, jsonline)) {
|
||||
nlohmann::json document = nlohmann::json::parse(jsonline);
|
||||
collection->add(document);
|
||||
}
|
||||
|
||||
infile.close();
|
||||
cout << "FINISHED INDEXING!" << endl << flush;
|
||||
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
|
@ -145,23 +145,17 @@ static int create_listener(void) {
|
||||
}
|
||||
|
||||
void index_documents() {
|
||||
//std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.txt");
|
||||
std::ifstream infile("/Users/kishore/Downloads/hnstories.tsv");
|
||||
//std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.jsonl");
|
||||
std::ifstream infile("/Users/kishore/Downloads/hnstories.jsonl");
|
||||
|
||||
std::string line;
|
||||
std::string jsonline;
|
||||
|
||||
while (std::getline(infile, line)) {
|
||||
std::vector<std::string> parts;
|
||||
StringUtils::tokenize(line, parts, "\t", true);
|
||||
line = StringUtils::replace_all(line, "\"", "");
|
||||
|
||||
std::vector<std::string> tokens;
|
||||
StringUtils::tokenize(parts[0], tokens, " ", true);
|
||||
|
||||
if(parts.size() != 2) continue;
|
||||
collection->add(tokens, stoi(parts[1]));
|
||||
while (std::getline(infile, jsonline)) {
|
||||
nlohmann::json document = nlohmann::json::parse(jsonline);
|
||||
collection->add(document);
|
||||
}
|
||||
|
||||
infile.close();
|
||||
std::cout << "FINISHED INDEXING!" << std::endl << std::flush;
|
||||
}
|
||||
|
||||
|
16
test/documents.jsonl
Normal file
16
test/documents.jsonl
Normal file
@ -0,0 +1,16 @@
|
||||
{"points":15,"title":"How are cryogenic rocket propellants delivered to the launch pad?"}
|
||||
{"points":14,"title":"Are there any (free) are online data archives for data from instruments on Soviet / Russian missions?"}
|
||||
{"points":13,"title":"Where should I look in ISS to find mouldy food?"}
|
||||
{"points":13,"title":"Is solar system active cryovolcanism a potential viable power source for future colonies?"}
|
||||
{"points":13,"title":"The heaviest martian spacecraft"}
|
||||
{"points":13,"title":"To what extent are the US modules of ISS based on the Spacelab design?"}
|
||||
{"points":12,"title":"Could future astronauts eat during EVAs?"}
|
||||
{"points":12,"title":"What is the power requirement of a spacesuit?"}
|
||||
{"points":12,"title":"How does plant growing medium not scatter around?"}
|
||||
{"points":12,"title":"Is there research for the optimal small crew size for a long space voyage?"}
|
||||
{"points":12,"title":"Do long term missions receive insurance coverage?"}
|
||||
{"points":12,"title":"What do they exactly look for when searching for extraterrestrial intelligence?"}
|
||||
{"points":11,"title":"What were emergency procedures for failure of launch vehicles with nuclear upper stages?"}
|
||||
{"points":11,"title":"Mathematics used for F9R flyback lunch and landing"}
|
||||
{"points":11,"title":"What considerations have been made lunch for waste produced during colonisation?"}
|
||||
{"points":10,"title":"Do late do the propellants lunch ionize in chemical rockets?"}
|
@ -1,16 +0,0 @@
|
||||
How are cryogenic rocket propellants delivered to the launch pad? 15
|
||||
Are there any (free) are online data archives for data from instruments on Soviet / Russian missions? 14
|
||||
Where should I look in ISS to find mouldy food? 13
|
||||
Is solar system active cryovolcanism a potential viable power source for future colonies? 13
|
||||
The heaviest martian spacecraft 13
|
||||
To what extent are the US modules of ISS based on the Spacelab design? 13
|
||||
Could future astronauts eat during EVAs? 12
|
||||
What is the power requirement of a spacesuit? 12
|
||||
How does plant growing medium not scatter around? 12
|
||||
Is there research for the optimal small crew size for a long space voyage? 12
|
||||
Do long term missions receive insurance coverage? 12
|
||||
What do they exactly look for when searching for extraterrestrial intelligence? 12
|
||||
What were emergency procedures for failure of launch vehicles with nuclear upper stages? 11
|
||||
Mathematics used for F9R flyback lunch and landing 11
|
||||
What considerations have been made lunch for waste produced during colonisation? 11
|
||||
Do late do the propellants lunch ionize in chemical rockets? 10
|
Loading…
x
Reference in New Issue
Block a user