mirror of
https://github.com/D4C1-Labs/Flipper-ARF.git
synced 2026-05-10 22:56:55 +00:00
256 lines
8.0 KiB
C
256 lines
8.0 KiB
C
#include <stdio.h>
|
|
#include <stdint.h>
|
|
|
|
#include "m-array.h"
|
|
#include "m-dict.h"
|
|
#include "m-string.h"
|
|
#include "m-algo.h"
|
|
|
|
/* Based on:
|
|
* Easy Perfect Minimal Hashing
|
|
* By Steve Hanov. Released to the public domain.
|
|
* http://stevehanov.ca/blog/index.php?id=119
|
|
*
|
|
* Based on:
|
|
* Edward A. Fox, Lenwood S. Heath, Qi Fan Chen and Amjad M. Daoud,
|
|
* "Practical minimal perfect hash functions for large databases", CACM, 35(1):105-121
|
|
*
|
|
* also a good reference:
|
|
* Compress, Hash, and Displace algorithm by Djamal Belazzougui,
|
|
* Fabiano C. Botelho, and Martin Dietzfelbinger
|
|
*/
|
|
|
|
|
|
|
|
/* Define the needed data structure : */
|
|
|
|
// Define a dictionnary of constant string --> integer and register it.
|
|
DICT_DEF2(dict_mph, const char *, M_CSTR_OPLIST, uint32_t, M_DEFAULT_OPLIST)
|
|
#define M_OPL_dict_mph_t() DICT_OPLIST(dict_mph, M_CSTR_OPLIST, M_DEFAULT_OPLIST)
|
|
|
|
// Define an array of integer to store the value and register it.
|
|
ARRAY_DEF(array_value, uint32_t) // Start from 1. 0 is 'None'
|
|
#define M_OPL_array_value_t() ARRAY_OPLIST(array_value)
|
|
|
|
// Define an array of integer to store the seed and register it.
|
|
ARRAY_DEF(array_seed, int32_t)
|
|
#define M_OPL_array_seed_t() ARRAY_OPLIST(array_seed)
|
|
// Define algorithms on array_seed
|
|
ALGO_DEF(array_seed, array_seed_t)
|
|
|
|
// Define an array of constant string that are ordered by their size.
|
|
ARRAY_DEF(array_cstr, const char *, M_CSTR_OPLIST)
|
|
// Register it globally but with a custom CMP operator
|
|
static inline int array_cstr_cmp(const array_cstr_t a, const array_cstr_t b) {
|
|
// We compare only the size
|
|
size_t sa = array_cstr_size(a);
|
|
size_t sb = array_cstr_size(b);
|
|
return (sa > sb) ? -1 : (sa < sb);
|
|
}
|
|
#define M_OPL_array_cstr_t() M_OPEXTEND(ARRAY_OPLIST(array_cstr, M_CSTR_OPLIST), CMP(array_cstr_cmp))
|
|
|
|
// Define an array of array of constant string, and some algo over this container
|
|
ARRAY_DEF(array_bucket, array_cstr_t)
|
|
#define M_OPL_array_bucket_t() ARRAY_OPLIST(array_bucket, M_OPL_array_cstr_t() )
|
|
ALGO_DEF(array_bucket, array_bucket_t)
|
|
|
|
// Define an array of string to save the input.
|
|
ARRAY_DEF(array_string, string_t)
|
|
#define M_OPL_array_string_t() ARRAY_OPLIST(array_string, M_OPL_string_t())
|
|
|
|
|
|
|
|
|
|
/* The algorithm may not converge on all inputs if the hash function
|
|
* doesn't randomly distribute the inputs:
|
|
* str1 != str2 ==> \exist seed \such_as hash(seed,str1) != hash(seed, str2)
|
|
* The first hash function below doesn't provide the needed property for
|
|
* inputs with two words 'a' and 'c'
|
|
* The second one is more robust but may still have this issue.
|
|
*/
|
|
static uint32_t hash(int32_t seed, const char str[])
|
|
{
|
|
#if 0
|
|
if (seed == 0) seed = 0x01000193;
|
|
while (*str) {
|
|
seed = ((seed * 0x01000193) ^ *str);
|
|
str++;
|
|
}
|
|
return seed;
|
|
#else
|
|
if (seed == 0) seed = 0x811C9DC5;
|
|
while (*str) {
|
|
seed = (seed ^ *str) * 16777619;
|
|
str++;
|
|
}
|
|
return seed ^ (seed >> 16);
|
|
#endif
|
|
}
|
|
|
|
// Step 1: place all key keys into buckets
|
|
static void
|
|
CreateMinimalPerfectHash1(array_seed_t seed, array_value_t value,
|
|
array_bucket_t bucket,
|
|
const dict_mph_t dict)
|
|
{
|
|
size_t size = dict_mph_size (dict);
|
|
|
|
array_value_resize(value, size);
|
|
array_seed_resize(seed, size);
|
|
array_bucket_resize(bucket, size);
|
|
|
|
for M_EACH(item, dict, dict_mph_t) {
|
|
size_t i = hash(0, item->key) % size;
|
|
array_cstr_push_back (*array_bucket_get (bucket, i), item->key);
|
|
}
|
|
}
|
|
|
|
// Step 2: Sort the buckets and process the ones with the most items first.
|
|
static void
|
|
CreateMinimalPerfectHash2(array_seed_t seed, array_value_t value,
|
|
array_bucket_t bucket,
|
|
const dict_mph_t dict)
|
|
{
|
|
size_t size = dict_mph_size (dict);
|
|
array_bucket_sort(bucket);
|
|
M_LET(slots, array_seed_t) {
|
|
for M_EACH(b, bucket, array_bucket_t) {
|
|
if (array_cstr_size(*b) <= 1)
|
|
break;
|
|
size_t d = 1;
|
|
/* Repeatedly try different values of d until we find a hash function
|
|
that places all items in the bucket into free slots.
|
|
This may loop forever on some inputs if the hash function is not good
|
|
enought. */
|
|
for(bool cont = true; cont ; ) {
|
|
cont = false;
|
|
array_seed_reset(slots);
|
|
for M_EACH(it, *b, array_cstr_t) {
|
|
size_t s = hash(d, *it) % size;
|
|
if (*array_value_get(value, s) != 0 || array_seed_contain_p(slots, s)){
|
|
d++;
|
|
cont = true;
|
|
break;
|
|
}
|
|
array_seed_push_back (slots, s);
|
|
}
|
|
}
|
|
// Fill in the seed & value with the computed value
|
|
array_seed_set_at(seed, hash(0, *array_cstr_get(*b, 0)) % size, d);
|
|
d = 0;
|
|
for M_EACH(it, *b, array_cstr_t) {
|
|
array_value_set_at(value,
|
|
*array_seed_get(slots, d),
|
|
*dict_mph_get(dict, *it));
|
|
d++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Step 3:
|
|
* Only buckets with 1 item remain. Process them more quickly by directly
|
|
* placing them into a free slot. Use a negative value of d to indicate
|
|
* this.
|
|
*/
|
|
static void
|
|
CreateMinimalPerfectHash3(array_seed_t seed, array_value_t value,
|
|
array_bucket_t bucket,
|
|
const dict_mph_t dict)
|
|
{
|
|
size_t size = dict_mph_size (dict);
|
|
|
|
M_LET(freelist, array_seed_t) {
|
|
// Get a list of available slot
|
|
for(size_t i = 0; i < size; i++)
|
|
if (*array_value_get(value, i) == 0)
|
|
array_seed_push_back(freelist, i);
|
|
for M_EACH(b, bucket, array_bucket_t) {
|
|
if (array_cstr_size(*b) != 1)
|
|
continue;
|
|
int32_t slot;
|
|
array_seed_pop_back(&slot, freelist);
|
|
/*We subtract one to ensure it's negative even if the zeroeth slot was
|
|
used. */
|
|
array_seed_set_at (seed, hash(0, *array_cstr_get(*b, 0)) % size,
|
|
-slot-1);
|
|
array_value_set_at(value, slot, *dict_mph_get(dict, *array_cstr_get(*b, 0)));
|
|
}
|
|
}
|
|
}
|
|
|
|
static void
|
|
CreateMinimalPerfectHash(array_seed_t seed, array_value_t value,
|
|
const dict_mph_t dict)
|
|
{
|
|
M_LET(bucket, array_bucket_t) {
|
|
CreateMinimalPerfectHash1(seed, value, bucket, dict);
|
|
CreateMinimalPerfectHash2(seed, value, bucket, dict);
|
|
CreateMinimalPerfectHash3(seed, value, bucket, dict);
|
|
}
|
|
}
|
|
|
|
static void
|
|
dict_read_from_file(dict_mph_t dict, array_string_t arr, const char filename[])
|
|
{
|
|
FILE *f = fopen(filename, "rt");
|
|
if (!f) {
|
|
fprintf(stderr, "ERROR: Cannot open %s\n", filename);
|
|
exit(2);
|
|
}
|
|
|
|
uint32_t line = 1;
|
|
M_LET(str, string_t) {
|
|
while (!feof(f) && !ferror(f)) {
|
|
string_fgets(str, f, STRING_READ_PURE_LINE);
|
|
string_strim(str);
|
|
if (!string_empty_p(str)) {
|
|
// Not clean as items of 'dict' points are linked to items of 'arr'
|
|
// and the containers are not aware of this link.
|
|
array_string_push_back (arr, str);
|
|
dict_mph_set_at(dict, string_get_cstr(*array_string_back(arr)), line);
|
|
line++;
|
|
}
|
|
}
|
|
}
|
|
fclose(f);
|
|
}
|
|
|
|
static uint32_t
|
|
perfect_hash_lockup(array_seed_t seed, array_value_t value,
|
|
const char key[])
|
|
{
|
|
int32_t s = *array_seed_get(seed, hash(0, key) % array_seed_size(seed));
|
|
if (s < 0)
|
|
return *array_value_get(value, -s-1);
|
|
else
|
|
return *array_value_get(value, hash(s, key) % array_value_size(value));
|
|
}
|
|
|
|
static void
|
|
test(array_seed_t seed, array_value_t value, dict_mph_t dict)
|
|
{
|
|
for M_EACH(item, dict, dict_mph_t) {
|
|
const char *key = item->key;
|
|
uint32_t v = perfect_hash_lockup(seed, value, key);
|
|
if (v != item->value) {
|
|
fprintf(stderr, "ERROR for %s: %u VS %u\n", key, v, item->value);
|
|
}
|
|
}
|
|
}
|
|
|
|
int main(int argc, const char *argv[])
|
|
{
|
|
// Create arr as array_string_t, dict as dist_mph_t and seed as array_seed_t
|
|
M_LET(arr, array_string_t) M_LET(dict, dict_mph_t) M_LET(seed, array_seed_t)
|
|
// and value as array_value_t
|
|
M_LET(value, array_value_t) {
|
|
const char *filename = (argc > 1) ? argv[1] :"/usr/share/dict/words";
|
|
dict_read_from_file(dict, arr, filename);
|
|
CreateMinimalPerfectHash(seed, value, dict);
|
|
test(seed, value, dict);
|
|
}
|
|
|
|
return 0;
|
|
}
|