nes-proj/os/storage/antelope/index-maxheap.c
2017-09-22 17:09:54 +02:00

751 lines
18 KiB
C

/*
* Copyright (c) 2010, Swedish Institute of Computer Science
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the Institute nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/**
* \file
* MaxHeap - A binary maximum heap index for flash memory.
*
* The idea behind the MaxHeap index is to write entries sequentially
* into small buckets, which are indexed in a binary maximum heap.
* Although sequential writes make the entries unsorted within a
* bucket, the time to load and scan a single bucket is small. The
* sequential write is important for flash memories, which are
* unable to handle multiple rewrites of the same page without doing
* an expensive erase operation between the rewrites.
*
* Each bucket specifies a range (a,b) of values that it accepts.
* Once a bucket fills up, two buckets are created with the ranges
* (a,mean) and (mean+1, b), respectively. The entries from the
* original bucket are then copied into the appropriate new bucket
* before the old bucket gets deleted.
* \author
* Nicolas Tsiftes <nvt@sics.se>
*/
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "cfs/cfs.h"
#include "cfs/cfs-coffee.h"
#include "lib/memb.h"
#include "lib/random.h"
#include "db-options.h"
#include "index.h"
#include "result.h"
#include "storage.h"
#define DEBUG DEBUG_NONE
#include "net/ipv6/uip-debug.h"
/* Number of children per heap node; the first child of node i is stored at
   index BRANCH_FACTOR * i + 1. */
#define BRANCH_FACTOR 2
/* Number of (key, value) pairs that fit into one bucket. */
#define BUCKET_SIZE 128
/* Maximum number of heap nodes, and thereby also of buckets. */
#define NODE_LIMIT 511
/* Number of levels in a heap that holds NODE_LIMIT nodes. */
#define NODE_DEPTH 9
/* Compile-time check that NODE_DEPTH and NODE_LIMIT agree with each other. */
#if (1 << NODE_DEPTH) != (NODE_LIMIT + 1)
#error "NODE_DEPTH is set incorrectly."
#endif
/* A heap node or pair whose fields are all zero is treated as unused. */
#define EMPTY_NODE(node) ((node)->min == 0 && (node)->max == 0)
#define EMPTY_PAIR(pair) ((pair)->key == 0 && (pair)->value == 0)
/* Keys and values are both stored as 16-bit unsigned integers. */
typedef uint16_t maxheap_key_t;
typedef uint16_t maxheap_value_t;
/* Key range assigned to the first node inserted into a newly created heap. */
#define KEY_MIN 0
#define KEY_MAX 65535
/* A heap node: the inclusive range [min, max] of transformed keys that the
   bucket with the same index accepts. */
struct heap_node {
maxheap_key_t min;
maxheap_key_t max;
};
typedef struct heap_node heap_node_t;
/* One index entry as it is stored inside a bucket. */
struct key_value_pair {
maxheap_key_t key;
maxheap_value_t value;
};
/* A bucket: a fixed-size array of entries that is filled sequentially. */
struct bucket {
struct key_value_pair pairs[BUCKET_SIZE];
};
typedef struct bucket bucket_t;
/* Run-time state of one MaxHeap index. */
struct heap {
/* Storage handle of the heap file (bucket file name followed by the nodes). */
db_storage_id_t heap_storage;
/* Storage handle of the bucket file holding the (key, value) pairs. */
db_storage_id_t bucket_storage;
/* Remember where the next free slot for each bucket is located.
   A value of 0 makes bucket_read() fetch the whole bucket, and
   bucket_load() then derives the slot by scanning for the first empty
   pair. */
uint8_t next_free_slot[NODE_LIMIT];
};
typedef struct heap heap_t;
/* One cached bucket. An entry whose heap pointer is NULL is unused. */
struct bucket_cache {
heap_t *heap;
uint16_t bucket_id;
bucket_t bucket;
};
/* Keep a cache of buckets read from storage. */
static struct bucket_cache bucket_cache[DB_HEAP_CACHE_LIMIT];
/* Memory block pool from which create() and load() allocate heap_t objects. */
MEMB(heaps, heap_t, DB_HEAP_INDEX_LIMIT);
/* Bucket cache management. */
static struct bucket_cache *get_cache(heap_t *, int);
static struct bucket_cache *get_cache_free(void);
static void invalidate_cache(void);
/* Key transformation applied before a key is matched against node ranges. */
static maxheap_key_t transform_key(maxheap_key_t);
/* Heap node access and traversal. */
static int heap_read(heap_t *, int, heap_node_t *);
static int heap_write(heap_t *, int, heap_node_t *);
static int heap_insert(heap_t *, maxheap_key_t, maxheap_key_t);
static int heap_find(heap_t *, maxheap_key_t key, int *iterator);
#if HEAP_DEBUG
static void heap_print(heap_t *);
#endif
/* Bucket access. */
static int bucket_read(heap_t *, int, bucket_t *);
static struct bucket_cache *bucket_load(heap_t *, int);
static int bucket_append(heap_t *, int, struct key_value_pair *);
static int bucket_split(heap_t *, int);
/* Functions exported through the index_maxheap API table. */
static db_result_t create(index_t *);
static db_result_t destroy(index_t *);
static db_result_t load(index_t *);
static db_result_t release(index_t *);
static db_result_t insert(index_t *, attribute_value_t *, tuple_id_t);
static db_result_t delete(index_t *, attribute_value_t *);
static tuple_id_t get_next(index_iterator_t *);
/* API table through which this index implementation is exported. The
   entries are the index type, its flags, and the operation callbacks
   defined in this file. */
index_api_t index_maxheap = {
INDEX_MAXHEAP,
INDEX_API_EXTERNAL,
create,
destroy,
load,
release,
insert,
delete,
get_next
};
/*
 * Look up the cache entry that holds bucket `bucket_id' of `heap'.
 *
 * Returns a pointer to the matching entry, or NULL if that bucket is not
 * cached.
 */
static struct bucket_cache *
get_cache(heap_t *heap, int bucket_id)
{
  struct bucket_cache *entry;

  for(entry = bucket_cache;
      entry < &bucket_cache[DB_HEAP_CACHE_LIMIT];
      entry++) {
    if(entry->heap != heap) {
      continue;
    }
    if(entry->bucket_id == bucket_id) {
      return entry;
    }
  }

  return NULL;
}
static struct bucket_cache *
get_cache_free(void)
{
int i;
for(i = 0; i < DB_HEAP_CACHE_LIMIT; i++) {
if(bucket_cache[i].heap == NULL) {
return &bucket_cache[i];
}
}
return NULL;
}
static void
invalidate_cache(void)
{
int i;
for(i = 0; i < DB_HEAP_CACHE_LIMIT; i++) {
if(bucket_cache[i].heap != NULL) {
bucket_cache[i].heap = NULL;
break;
}
}
}
/*
 * Transform a key into the value that is matched against heap node ranges.
 *
 * The random number generator is seeded with the key and the first number
 * it produces is returned. Note that this reseeds the generator as a side
 * effect.
 * NOTE(review): this presumably yields the same result for the same key on
 * every call; confirm against the random library in use.
 */
static maxheap_key_t
transform_key(maxheap_key_t key)
{
  maxheap_key_t transformed;

  random_init(key);
  transformed = random_rand();

  return transformed;
}
/*
 * Read heap node number `bucket_id' from the heap file into `node'.
 *
 * The nodes are stored after the first DB_MAX_FILENAME_LENGTH bytes of the
 * heap file.
 *
 * Returns 1 on success and 0 if the storage layer reports an error.
 */
static int
heap_read(heap_t *heap, int bucket_id, heap_node_t *node)
{
  unsigned long offset;

  offset = DB_MAX_FILENAME_LENGTH + (unsigned long)bucket_id * sizeof(*node);

  return DB_ERROR(storage_read(heap->heap_storage, node, offset,
                               sizeof(*node))) ? 0 : 1;
}
/*
 * Write `node' to position `bucket_id' of the heap file.
 *
 * The nodes are stored after the first DB_MAX_FILENAME_LENGTH bytes of the
 * heap file.
 *
 * Returns 1 on success and 0 if the storage layer reports an error.
 */
static int
heap_write(heap_t *heap, int bucket_id, heap_node_t *node)
{
  unsigned long offset;

  offset = DB_MAX_FILENAME_LENGTH + (unsigned long)bucket_id * sizeof(*node);

  return DB_ERROR(storage_write(heap->heap_storage, node, offset,
                                sizeof(*node))) ? 0 : 1;
}
/*
 * Insert a node covering the key range [min, max] into the heap.
 *
 * Starting at the root, the search descends into the first child of every
 * occupied node whose range encloses [min, max], and otherwise moves on to
 * the next node index. The new node is written into the first empty node
 * encountered.
 *
 * Returns the index of the node that was written, or -1 if min > max, if a
 * storage operation failed, or if the search ran past NODE_LIMIT.
 */
static int
heap_insert(heap_t *heap, maxheap_key_t min, maxheap_key_t max)
{
  heap_node_t current;
  int index;

  PRINTF("DB: Insert node (%ld,%ld) into the heap\n", (long)min, (long)max);

  if(min > max) {
    return -1;
  }

  index = 0;
  while(index < NODE_LIMIT) {
    if(heap_read(heap, index, &current) == 0) {
      PRINTF("DB: Failed to read heap node %d\n", index);
      return -1;
    }

    if(!EMPTY_NODE(&current)) {
      if(current.min <= min && max <= current.max) {
        /* The new range fits inside this node: descend to its children. */
        index = BRANCH_FACTOR * index + 1;
      } else {
        index++;
      }
      continue;
    }

    /* Found an unused node: claim it for the new range. */
    current.min = min;
    current.max = max;
    if(heap_write(heap, index, &current) == 0) {
      PRINTF("DB: Failed to write heap node %d\n", index);
      return -1;
    }
    return index;
  }

  PRINTF("DB: No more nodes available\n");
  return -1;
}
/*
 * Find the next heap node on the downward path for `key'.
 *
 * The search starts at node index *iterator and advances one index at a
 * time until it finds a node whose range contains the transformed key. It
 * stops at the first empty node, at a read error, or at NODE_LIMIT.
 *
 * On success, *iterator is set to the index of the first child of the
 * returned node, so that repeated calls walk down the heap one level at a
 * time. For a node on the deepest level that index is >= NODE_LIMIT, which
 * makes the following call return -1 without reading anything.
 *
 * A matching node on the deepest level is returned like any other node.
 * Earlier, such nodes were rejected because they have no children, which
 * made buckets created on the bottom level unreachable.
 *
 * Returns the index of the matching node, or -1 if there is none. *iterator
 * is left unchanged when -1 is returned.
 */
static int
heap_find(heap_t *heap, maxheap_key_t key, int *iterator)
{
  maxheap_key_t hashed_key;
  int i;
  /* Kept static as in the original code, presumably to save stack space. */
  static heap_node_t node;

  hashed_key = transform_key(key);

  for(i = *iterator; i < NODE_LIMIT; i++) {
    if(heap_read(heap, i, &node) == 0 || EMPTY_NODE(&node)) {
      break;
    }

    if(node.min <= hashed_key && hashed_key <= node.max) {
      *iterator = BRANCH_FACTOR * i + 1;
      return i;
    }
  }

  return -1;
}
#if HEAP_DEBUG
/*
 * Debug helper: print the heap nodes in index order, one node per line and
 * indented with one tab per heap level. Printing stops at the first node
 * that is empty or cannot be read.
 */
static void
heap_print(heap_t *heap)
{
  heap_node_t node;
  int depth = 0;                       /* Level of the node being printed. */
  int printed_on_level = 0;            /* Nodes already printed on it. */
  int level_width = BRANCH_FACTOR;     /* Number of nodes on that level. */
  int index = 0;
  int tab;

  while(heap_read(heap, index, &node) != 0 && !EMPTY_NODE(&node)) {
    for(tab = depth; tab > 0; tab--) {
      PRINTF("\t");
    }
    PRINTF("(%ld,%ld)\n", (long)node.min, (long)node.max);

    if(depth == 0) {
      /* The root is alone on its level. */
      depth++;
    } else if(printed_on_level + 1 == level_width) {
      /* Last node of this level: continue on the next, wider level. */
      depth++;
      printed_on_level = 0;
      level_width = level_width * BRANCH_FACTOR;
    } else {
      printed_on_level++;
    }

    index++;
  }
}
#endif /* HEAP_DEBUG */
/*
 * Read bucket `bucket_id' from the bucket file into `bucket'.
 *
 * Only the slots below the recorded next free slot are read. When that
 * value is 0, the complete bucket is read instead.
 *
 * Returns 1 on success and 0 if the storage layer reports an error.
 */
static int
bucket_read(heap_t *heap, int bucket_id, bucket_t *bucket)
{
  size_t slots;
  unsigned long offset;

  slots = heap->next_free_slot[bucket_id];
  if(slots == 0) {
    slots = BUCKET_SIZE;
  }

  offset = (unsigned long)bucket_id * sizeof(*bucket);

  return DB_ERROR(storage_read(heap->bucket_storage, bucket, offset,
                               slots * sizeof(struct key_value_pair))) ? 0 : 1;
}
/*
 * Return a cache entry holding bucket `bucket_id' of `heap', reading the
 * bucket from storage if it is not cached yet.
 *
 * If no cache entry is free, one occupied entry is released first. When
 * the next free slot of the bucket is recorded as 0, it is recomputed as
 * the index of the first empty pair in the bucket just read.
 *
 * Returns the cache entry, or NULL if no entry could be obtained or the
 * bucket could not be read.
 */
static struct bucket_cache *
bucket_load(heap_t *heap, int bucket_id)
{
  struct bucket_cache *entry;
  int slot;

  entry = get_cache(heap, bucket_id);
  if(entry != NULL) {
    return entry;
  }

  entry = get_cache_free();
  if(entry == NULL) {
    invalidate_cache();
    if((entry = get_cache_free()) == NULL) {
      return NULL;
    }
  }

  if(bucket_read(heap, bucket_id, &entry->bucket) == 0) {
    return NULL;
  }

  /* The entry becomes valid only once the read has succeeded. */
  entry->heap = heap;
  entry->bucket_id = bucket_id;

  if(heap->next_free_slot[bucket_id] == 0) {
    slot = 0;
    while(slot < BUCKET_SIZE && !EMPTY_PAIR(&entry->bucket.pairs[slot])) {
      slot++;
    }
    heap->next_free_slot[bucket_id] = slot;
  }

  PRINTF("DB: Loaded bucket %d, the next free slot is %u\n", bucket_id,
         (unsigned)heap->next_free_slot[bucket_id]);

  return entry;
}
/*
 * Append `pair' to bucket `bucket_id' by writing it into the bucket's next
 * free slot in the bucket file, and advance that slot on success.
 *
 * Returns 1 on success, and 0 if the bucket is full or the write failed.
 */
static int
bucket_append(heap_t *heap, int bucket_id, struct key_value_pair *pair)
{
  unsigned long offset;
  uint8_t slot;

  slot = heap->next_free_slot[bucket_id];
  if(slot >= BUCKET_SIZE) {
    PRINTF("DB: Invalid write attempt to the full bucket %d\n", bucket_id);
    return 0;
  }

  /* Start of the bucket, plus the position of the slot inside it. */
  offset = (unsigned long)bucket_id * sizeof(bucket_t);
  offset += slot * sizeof(struct key_value_pair);

  if(DB_ERROR(storage_write(heap->bucket_storage, pair, offset,
                            sizeof(*pair)))) {
    return 0;
  }

  heap->next_free_slot[bucket_id] = slot + 1;
  return 1;
}
/*
 * Split the key range of heap node `bucket_id' in two halves and insert a
 * heap node for each half: [min, mean] and [mean + 1, max].
 *
 * Only heap nodes are created here; the entries stored in the bucket are
 * not touched by this function.
 *
 * Returns 1 if both nodes were inserted and 0 otherwise. If inserting the
 * second node fails, the first one is not removed again.
 */
static int
bucket_split(heap_t *heap, int bucket_id)
{
  heap_node_t parent;
  maxheap_key_t mean;

  if(heap_read(heap, bucket_id, &parent) == 0) {
    return 0;
  }

  mean = parent.min + ((parent.max - parent.min) / 2);

  PRINTF("DB: Split bucket %d (%ld, %ld) at mean value %ld\n", bucket_id,
         (long)parent.min, (long)parent.max, (long)mean);

  /* Lower half first, then upper half. */
  if(heap_insert(heap, parent.min, mean) < 0) {
    return 0;
  }
  if(heap_insert(heap, mean + 1, parent.max) < 0) {
    return 0;
  }

  return 1;
}
/*
 * Insert a (key, value) pair into the heap index.
 *
 * The pair is appended to the deepest bucket on the heap path for `key'.
 * If that bucket is full, it is split first and the pair goes into the
 * newly created child that matches the key.
 *
 * Two corrections compared to the earlier version:
 *  - A next free slot of 0 can mean "not determined yet" (this is the
 *    state after load()). The bucket is therefore loaded before use so
 *    that the real slot is known; otherwise slot 0 of an already
 *    populated bucket would be overwritten.
 *  - If the bucket is present in the bucket cache, the cached copy is
 *    updated with the new pair, so that later lookups do not work on a
 *    stale copy.
 *
 * Returns 1 on success and 0 on failure.
 */
int
insert_item(heap_t *heap, maxheap_key_t key, maxheap_value_t value)
{
  int heap_iterator;
  int bucket_id, last_good_bucket_id;
  struct key_value_pair pair;
  struct bucket_cache *cached;

  /* Walk down the heap; the last node found is the deepest match. */
  heap_iterator = 0;
  last_good_bucket_id = -1;
  while((bucket_id = heap_find(heap, key, &heap_iterator)) >= 0) {
    last_good_bucket_id = bucket_id;
  }

  bucket_id = last_good_bucket_id;
  if(bucket_id < 0) {
    PRINTF("DB: No bucket for key %ld\n", (long)key);
    return 0;
  }

  /* Make sure the fill level of the bucket is known before writing. */
  if(heap->next_free_slot[bucket_id] == 0 &&
     bucket_load(heap, bucket_id) == NULL) {
    PRINTF("DB: Failed to load bucket %d\n", bucket_id);
    return 0;
  }

  pair.key = key;
  pair.value = value;

  if(heap->next_free_slot[bucket_id] == BUCKET_SIZE) {
    PRINTF("DB: Bucket %d is full\n", bucket_id);
    if(bucket_split(heap, bucket_id) == 0) {
      return 0;
    }

    /* Select one of the newly created buckets. heap_iterator still refers
       to the first child of the bucket that was split. */
    bucket_id = heap_find(heap, key, &heap_iterator);
    if(bucket_id < 0) {
      return 0;
    }
  }

  if(bucket_append(heap, bucket_id, &pair) == 0) {
    return 0;
  }

  /* Keep a cached copy of the bucket in sync with the storage. The pair
     was written to the slot just below the new next free slot. */
  cached = get_cache(heap, bucket_id);
  if(cached != NULL) {
    cached->bucket.pairs[heap->next_free_slot[bucket_id] - 1] = pair;
  }

  PRINTF("DB: Inserted key %ld (hash %ld) into the heap at bucket_id %d\n",
         (long)key, (long)transform_key(key), bucket_id);

  return 1;
}
/*
 * Create a new MaxHeap index for `index'.
 *
 * Two files are generated: the heap file, whose name is stored in
 * index->descriptor_file, and the bucket file, whose name is written to
 * the beginning of the heap file. A heap object is allocated and stored in
 * index->opaque_data, and a root node covering [KEY_MIN, KEY_MAX] is
 * inserted.
 *
 * On failure, everything that was set up is undone: open storage handles
 * are closed, the heap object is freed, index->opaque_data is cleared and
 * both generated files are removed. The earlier version passed an
 * uninitialized buffer to cfs_remove() instead of the heap file name, and
 * closed storage handles that had never been opened.
 *
 * Returns DB_OK on success, or DB_INDEX_ERROR, DB_ALLOCATION_ERROR or
 * DB_STORAGE_ERROR on failure.
 */
static db_result_t
create(index_t *index)
{
  char bucket_filename[DB_MAX_FILENAME_LENGTH];
  char *filename;
  db_result_t result;
  heap_t *heap;

  heap = NULL;
  bucket_filename[0] = '\0';

  /* Generate the heap file, which is the main index file that is
     referenced from the metadata of the relation. */
  filename = storage_generate_file("heap",
                                   (unsigned long)NODE_LIMIT * sizeof(heap_node_t));
  if(filename == NULL) {
    PRINTF("DB: Failed to generate a heap file\n");
    return DB_INDEX_ERROR;
  }
  /* Bounded string copy: never reads past the end of the generated name
     and always leaves the destination terminated. */
  strncpy(index->descriptor_file, filename,
          sizeof(index->descriptor_file) - 1);
  index->descriptor_file[sizeof(index->descriptor_file) - 1] = '\0';
  PRINTF("DB: Generated the heap file \"%s\" using %lu bytes of space\n",
         index->descriptor_file, (unsigned long)NODE_LIMIT * sizeof(heap_node_t));

  index->opaque_data = heap = memb_alloc(&heaps);
  if(heap == NULL) {
    PRINTF("DB: Failed to allocate a heap\n");
    result = DB_ALLOCATION_ERROR;
    goto end;
  }
  heap->heap_storage = -1;
  heap->bucket_storage = -1;

  /* Generate the bucket file, which stores the (key, value) pairs. */
  filename = storage_generate_file("bucket",
                                   (unsigned long)NODE_LIMIT * sizeof(bucket_t));
  if(filename == NULL) {
    PRINTF("DB: Failed to generate a bucket file\n");
    result = DB_INDEX_ERROR;
    goto end;
  }
  /* strncpy zero-fills the remainder, so the complete buffer that is
     written to the heap file below has defined contents. */
  strncpy(bucket_filename, filename, sizeof(bucket_filename) - 1);
  bucket_filename[sizeof(bucket_filename) - 1] = '\0';
  PRINTF("DB: Generated the bucket file \"%s\" using %lu bytes of space\n",
         bucket_filename, (unsigned long)NODE_LIMIT * sizeof(bucket_t));

  /* Initialize the heap. */
  memset(&heap->next_free_slot, 0, sizeof(heap->next_free_slot));

  heap->heap_storage = storage_open(index->descriptor_file);
  heap->bucket_storage = storage_open(bucket_filename);
  if(heap->heap_storage < 0 || heap->bucket_storage < 0) {
    result = DB_STORAGE_ERROR;
    goto end;
  }

  /* The heap file starts with the name of the bucket file. */
  if(DB_ERROR(storage_write(heap->heap_storage, bucket_filename, 0,
                            sizeof(bucket_filename)))) {
    result = DB_STORAGE_ERROR;
    goto end;
  }

  if(heap_insert(heap, KEY_MIN, KEY_MAX) < 0) {
    PRINTF("DB: Heap insertion error\n");
    result = DB_INDEX_ERROR;
    goto end;
  }

  PRINTF("DB: Created a heap index\n");
  result = DB_OK;

end:
  if(result != DB_OK) {
    if(heap != NULL) {
      /* Only close the storage handles that were actually opened. */
      if(heap->bucket_storage >= 0) {
        storage_close(heap->bucket_storage);
      }
      if(heap->heap_storage >= 0) {
        storage_close(heap->heap_storage);
      }
      memb_free(&heaps, heap);
      index->opaque_data = NULL;
    }
    if(index->descriptor_file[0] != '\0') {
      cfs_remove(index->descriptor_file);
      index->descriptor_file[0] = '\0';
    }
    if(bucket_filename[0] != '\0') {
      cfs_remove(bucket_filename);
    }
  }

  return result;
}
/*
 * Destroy the index.
 *
 * The index is released, and DB_INDEX_ERROR is returned unconditionally;
 * the result of release() is not taken into account, and no files are
 * removed here.
 */
static db_result_t
destroy(index_t *index)
{
  (void)release(index);

  return DB_INDEX_ERROR;
}
/*
 * Load an existing MaxHeap index whose heap file is named by
 * index->descriptor_file.
 *
 * A heap object is allocated and stored in index->opaque_data. The name of
 * the bucket file is read from the beginning of the heap file, and both
 * files are opened. All next free slots are reset to 0, which makes
 * bucket_load() recompute them when a bucket is loaded.
 *
 * Corrections compared to the earlier version:
 *  - The result of storage_read() is checked with DB_ERROR(), as it is
 *    everywhere else in this file, instead of being compared with a byte
 *    count.
 *  - The heap object is freed and index->opaque_data is cleared on every
 *    failure path.
 *  - Failures to open the heap file or the bucket file are detected.
 *
 * Returns DB_OK on success, DB_ALLOCATION_ERROR if no heap object could be
 * allocated, and DB_STORAGE_ERROR if a storage operation failed.
 */
static db_result_t
load(index_t *index)
{
  heap_t *heap;
  char bucket_file[DB_MAX_FILENAME_LENGTH];

  index->opaque_data = heap = memb_alloc(&heaps);
  if(heap == NULL) {
    PRINTF("DB: Failed to allocate a heap\n");
    return DB_ALLOCATION_ERROR;
  }
  heap->heap_storage = -1;
  heap->bucket_storage = -1;

  heap->heap_storage = storage_open(index->descriptor_file);
  if(heap->heap_storage < 0) {
    goto error;
  }

  /* The heap file starts with the name of the bucket file. */
  if(DB_ERROR(storage_read(heap->heap_storage, bucket_file, 0,
                           sizeof(bucket_file)))) {
    goto error;
  }
  /* Guard against a name that is not terminated in the file. */
  bucket_file[sizeof(bucket_file) - 1] = '\0';

  heap->bucket_storage = storage_open(bucket_file);
  if(heap->bucket_storage < 0) {
    goto error;
  }

  memset(&heap->next_free_slot, 0, sizeof(heap->next_free_slot));

  PRINTF("DB: Loaded max-heap index from file %s and bucket file %s\n",
         index->descriptor_file, bucket_file);

  return DB_OK;

error:
  if(heap->bucket_storage >= 0) {
    storage_close(heap->bucket_storage);
  }
  if(heap->heap_storage >= 0) {
    storage_close(heap->heap_storage);
  }
  memb_free(&heaps, heap);
  index->opaque_data = NULL;

  return DB_STORAGE_ERROR;
}
/*
 * Release the run-time resources of the index: close both storage handles
 * and free the heap object.
 *
 * Corrections compared to the earlier version:
 *  - An index without a heap object (index->opaque_data == NULL) is
 *    tolerated instead of being dereferenced.
 *  - Bucket cache entries that belong to the heap are dropped. They are
 *    looked up by heap pointer, so they could otherwise be mistaken for
 *    buckets of a heap that is allocated later at the same address.
 *  - index->opaque_data is cleared, so that a second call does not free
 *    the heap object twice.
 *
 * NOTE(review): DB_INDEX_ERROR is returned even when everything was
 * released. This looks unintended, but it is kept unchanged here because
 * callers outside this file may rely on it; confirm against the callers
 * before switching to DB_OK.
 */
static db_result_t
release(index_t *index)
{
  heap_t *heap;
  int i;

  heap = index->opaque_data;
  if(heap == NULL) {
    return DB_INDEX_ERROR;
  }

  for(i = 0; i < DB_HEAP_CACHE_LIMIT; i++) {
    if(bucket_cache[i].heap == heap) {
      bucket_cache[i].heap = NULL;
    }
  }

  storage_close(heap->bucket_storage);
  storage_close(heap->heap_storage);
  memb_free(&heaps, heap);
  index->opaque_data = NULL;

  return DB_INDEX_ERROR;
}
/*
 * Index API callback: insert the tuple id `value' under the attribute
 * value `key'.
 *
 * The key is converted to a long and then narrowed to the 16-bit key type
 * of the heap; the tuple id is narrowed to the 16-bit value type.
 *
 * Returns DB_OK on success and DB_INDEX_ERROR if the insertion failed.
 */
static db_result_t
insert(index_t *index, attribute_value_t *key, tuple_id_t value)
{
  heap_t *heap = (heap_t *)index->opaque_data;
  long long_key = db_value_to_long(key);

  if(insert_item(heap, (maxheap_key_t)long_key,
                 (maxheap_value_t)value) != 0) {
    return DB_OK;
  }

  PRINTF("DB: Failed to insert key %ld into a max-heap index\n", long_key);
  return DB_INDEX_ERROR;
}
/*
 * Index API callback for deleting an entry. Nothing is deleted; the
 * function always returns DB_INDEX_ERROR.
 */
static db_result_t
delete(index_t *index, attribute_value_t *value)
{
  (void)index;
  (void)value;

  return DB_INDEX_ERROR;
}
/*
 * Index API callback: return the next tuple id whose key lies in the range
 * [iterator->min_value, iterator->max_value].
 *
 * The search state is kept in a single static cache, which is reset when a
 * different iterator is passed in or when iterator->next_item_no is 0. For
 * the current key, all buckets on the heap path of that key are scanned,
 * starting with the deepest one. When a key has no more matches,
 * iterator->min_value is incremented and the search continues with the
 * next key.
 *
 * Corrections compared to the earlier version:
 *  - The scan position inside a bucket is reset when the search moves on
 *    to the next bucket on the path. It used to be carried over, which
 *    skipped the leading slots of that bucket.
 *  - Advancing to the next key is done in a loop instead of by recursion,
 *    so stack use no longer grows with the size of the key range.
 *  - The search also ends when min_value is greater than max_value,
 *    instead of incrementing min_value without bound.
 *
 * Returns the tuple id of the next match, or INVALID_TUPLE if there are no
 * more matches or a bucket could not be loaded.
 */
static tuple_id_t
get_next(index_iterator_t *iterator)
{
  struct iteration_cache {
    index_iterator_t *index_iterator;
    int heap_iterator;
    tuple_id_t found_items;
    uint8_t start;
    int visited_buckets[NODE_DEPTH];
    int end;
  };
  static struct iteration_cache cache;
  heap_t *heap;
  maxheap_key_t key;
  int bucket_id;
  int tmp_heap_iterator;
  int i;
  struct bucket_cache *bcache;
  uint8_t next_free_slot;

  heap = (heap_t *)iterator->index->opaque_data;

  for(;;) {
    /* NOTE(review): this reads the first bytes of the value object as a
       16-bit key, which presumably relies on its memory layout and byte
       order; confirm on the target platforms. */
    key = *(maxheap_key_t *)&iterator->min_value;

    if(cache.index_iterator != iterator || iterator->next_item_no == 0) {
      /* Initialize the cache for a new search. */
      cache.end = NODE_DEPTH - 1;
      cache.found_items = cache.start = 0;
      cache.index_iterator = iterator;

      /* Find the downward path through the heap consisting of all nodes
         that could possibly contain the key. */
      for(i = tmp_heap_iterator = 0; i < NODE_DEPTH; i++) {
        cache.visited_buckets[i] = heap_find(heap, key, &tmp_heap_iterator);
        if(cache.visited_buckets[i] < 0) {
          cache.end = i - 1;
          break;
        }
      }
      cache.heap_iterator = cache.end;
    }

    /*
     * Search for the key in each heap node, starting from the bottom
     * of the heap. Because the bottom nodes contain a very narrow
     * range of keys, there is a much higher chance that the key will be
     * there rather than at the top.
     */
    for(; cache.heap_iterator >= 0; cache.heap_iterator--) {
      bucket_id = cache.visited_buckets[cache.heap_iterator];

      PRINTF("DB: Find key %lu in bucket %d\n", (unsigned long)key, bucket_id);

      if((bcache = bucket_load(heap, bucket_id)) == NULL) {
        PRINTF("DB: Failed to load bucket %d\n", bucket_id);
        return INVALID_TUPLE;
      }

      /* Because keys are stored in an unsorted order in the bucket, we
       * need to search the bucket sequentially. */
      next_free_slot = heap->next_free_slot[bucket_id];
      for(i = cache.start; i < next_free_slot; i++) {
        if(bcache->bucket.pairs[i].key == key) {
          if(cache.found_items++ == iterator->next_item_no) {
            iterator->next_item_no++;
            /* Resume after this slot on the next call. */
            cache.start = i + 1;
            PRINTF("DB: Found key %ld with value %lu\n", (long)key,
                   (unsigned long)bcache->bucket.pairs[i].value);
            return (tuple_id_t)bcache->bucket.pairs[i].value;
          }
        }
      }

      /* This bucket is exhausted; scan the next one from its first slot. */
      cache.start = 0;
    }

    if(VALUE_INT(&iterator->min_value) >= VALUE_INT(&iterator->max_value)) {
      PRINTF("DB: Could not find key %ld in the index\n", (long)key);
      return INVALID_TUPLE;
    }

    /* Continue with the next key of the range. */
    iterator->next_item_no = 0;
    VALUE_INT(&iterator->min_value)++;
  }
}