Kishore Nallan 32cd67c9d1 The ART will store the frequency count in addition to the score.
In certain cases, the ability to identify the most similar tokens based on the popularity of the token is useful.
2016-06-08 22:18:52 +05:30

250 lines
5.9 KiB
C++

#ifndef ART_H
#define ART_H
#include <stdint.h>
#include <stdbool.h>
#include <vector>
#include "forarray.h"
#define IGNORE_PRINTF 1
#ifdef __cplusplus
extern "C" {
#endif
#define NODE4 1
#define NODE16 2
#define NODE48 3
#define NODE256 4
#define MAX_PREFIX_LEN 10
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
#if defined(__GNUC__) && !defined(__clang__)
# if __STDC_VERSION__ >= 199901L && 402 == (__GNUC__ * 100 + __GNUC_MINOR__)
/*
* GCC 4.2.2's C99 inline keyword support is pretty broken; avoid. Introduced in
* GCC 4.2.something, fixed in 4.3.0. So checking for specific major.minor of
* 4.2 is fine.
*/
# define BROKEN_GCC_C99_INLINE
# endif
#endif
typedef int(*art_callback)(void *data, const unsigned char *key, uint32_t key_len, void *value);
/**
* This struct is included as part
* of all the various node sizes
*/
typedef struct {
uint8_t type;
uint8_t num_children;
uint16_t max_score;
uint32_t max_token_count;
uint32_t partial_len;
unsigned char partial[MAX_PREFIX_LEN];
} art_node;
/**
* Small node with only 4 children
*/
typedef struct {
art_node n;
unsigned char keys[4];
art_node *children[4];
} art_node4;
/**
* Node with 16 children
*/
typedef struct {
art_node n;
unsigned char keys[16];
art_node *children[16];
} art_node16;
/**
* Node with 48 children, but
* a full 256 byte field.
*/
typedef struct {
art_node n;
unsigned char keys[256];
art_node *children[48];
} art_node48;
/**
* Full node with 256 children
*/
typedef struct {
art_node n;
art_node *children[256];
} art_node256;
/**
* Container for holding the documents that belong to a leaf.
*/
typedef struct {
forarray ids;
forarray offset_index;
forarray offsets;
} art_values;
/*
* Represents a document to be indexed.
* `offsets` refer to the index locations where a token appeared in the document
*/
typedef struct {
uint16_t score;
uint32_t id;
uint32_t offsets_len;
uint32_t* offsets;
} art_document;
/**
* Represents a leaf. These are
* of arbitrary size, as they include the key.
*/
typedef struct {
uint16_t max_score;
uint32_t token_count;
uint32_t key_len;
art_values* values;
unsigned char key[];
} art_leaf;
/**
* Main struct, points to root.
*/
typedef struct {
art_node *root;
uint64_t size;
} art_tree;
/**
* Initializes an ART tree
* @return 0 on success.
*/
int art_tree_init(art_tree *t);
/**
* DEPRECATED
* Initializes an ART tree
* @return 0 on success.
*/
#define init_art_tree(...) art_tree_init(__VA_ARGS__)
/**
* Destroys an ART tree
* @return 0 on success.
*/
int art_tree_destroy(art_tree *t);
/**
* DEPRECATED
* Initializes an ART tree
* @return 0 on success.
*/
#define destroy_art_tree(...) art_tree_destroy(__VA_ARGS__)
/**
* Returns the size of the ART tree.
*/
#ifdef BROKEN_GCC_C99_INLINE
# define art_size(t) ((t)->size)
#else
inline uint64_t art_size(art_tree *t) {
return t->size;
}
#endif
/**
* Inserts a new value into the ART tree
* @arg t The tree
* @arg key The key
* @arg key_len The length of the key
* @arg value Opaque value.
* @return NULL if the item was newly inserted, otherwise
* the old value pointer is returned.
*/
void* art_insert(art_tree *t, const unsigned char *key, int key_len, art_document* document, uint32_t num_hits);
/**
* Deletes a value from the ART tree
* @arg t The tree
* @arg key The key
* @arg key_len The length of the key
* @return NULL if the item was not found, otherwise
* the value pointer is returned.
*/
void* art_delete(art_tree *t, const unsigned char *key, int key_len);
/**
* Searches for a value in the ART tree
* @arg t The tree
* @arg key The key
* @arg key_len The length of the key
* @return NULL if the item was not found, otherwise
* the value pointer is returned.
*/
void* art_search(const art_tree *t, const unsigned char *key, int key_len);
/**
* Returns the minimum valued leaf
* @return The minimum leaf or NULL
*/
art_leaf* art_minimum(art_tree *t);
/**
* Returns the maximum valued leaf
* @return The maximum leaf or NULL
*/
art_leaf* art_maximum(art_tree *t);
/**
* Iterates through the entries pairs in the map,
* invoking a callback for each. The call back gets a
* key, value for each and returns an integer stop value.
* If the callback returns non-zero, then the iteration stops.
* @arg t The tree to iterate over
* @arg cb The callback function to invoke
* @arg data Opaque handle passed to the callback
* @return 0 on success, or the return of the callback.
*/
int art_iter(art_tree *t, art_callback cb, void *data);
/**
* Iterates through the entries pairs in the map,
* invoking a callback for each that matches a given prefix.
* The call back gets a key, value for each and returns an integer stop value.
* If the callback returns non-zero, then the iteration stops.
* @arg t The tree to iterate over
* @arg prefix The prefix of keys to read
* @arg prefix_len The length of the prefix
* @arg cb The callback function to invoke
* @arg data Opaque handle passed to the callback
* @return 0 on success, or the return of the callback.
*/
int art_iter_prefix(art_tree *t, const unsigned char *prefix, int prefix_len, art_callback cb, void *data);
/**
* Iterates through the entries pairs in the map,
* invoking a callback for each that matches a given prefix within a fuzzy distance of 2.
* The call back gets a key, value for each and returns an integer stop value.
* If the callback returns non-zero, then the iteration stops.
* @arg t The tree to iterate over
* @arg prefix The prefix of keys to read
* @arg prefix_len The length of the prefix
* @arg cb The callback function to invoke
* @arg data Opaque handle passed to the callback
* @return 0 on success, or the return of the callback.
*/
int art_iter_fuzzy_prefix(art_tree *t, const unsigned char *term, int term_len, int max_cost, int max_words, std::vector<art_leaf*> & results);
#ifdef __cplusplus
}
#endif
#endif