Files
mercury/library/hash_table.m
Simon Taylor 121a5e65eb Don't make the predicate version of hash_table__search
Estimated hours taken: 0.1
Branches: main

NEWS:
library/hash_table.m:
	Don't make the predicate version of hash_table__search
	obsolete. Fergus and Ralph prefer to use predicates
	instead of semidet functions.
2003-03-20 22:51:37 +00:00

603 lines
20 KiB
Mathematica

%-----------------------------------------------------------------------------%
% Copyright (C) 2001, 2003 The University of Melbourne
% This file may only be copied under the terms of the GNU Library General
% Public License - see the file COPYING.LIB in the Mercury distribution.
%-----------------------------------------------------------------------------%
% vim: ts=4 sw=4 et tw=0 wm=0 ft=mercury
%-----------------------------------------------------------------------------%
%
% File: hash_table.m
% Main author: rafe
% Stability: low
%
% Hash table implementation.
%
% This implementation uses double hashing and requires the user to
% supply a predicate that will compute two independent hash values
% for any given key.
%
% Default double-hash functions are provided for ints, strings and
% generic values.
%
% The number of buckets in the hash table is always a power of 2.
%
% When a user set occupancy level is achieved, the number of buckets
% in the table is doubled and the previous contents reinserted into
% the new hash table.
%
% CAVEAT: the user is referred to the warning at the head of array.m
% with regard to the current use of unique objects. Briefly, the
% problem is that the compiler does not yet properly understand
% unique modes, hence we fake it using non-unique modes.
% This means that care must be taken not to use an old version of a
% destructively updated structure (such as a hash_table) since the
% compiler will not currently detect such errors.
%
%-----------------------------------------------------------------------------%
:- module hash_table.
:- interface.
:- import_module int, assoc_list, float, string, array, bitmap, char.
:- type hash_table(K, V).
% XXX This is all fake until the compiler can handle nested unique modes.
%
:- inst hash_table ==
bound(ht(ground, ground, ground, hash_pred, bitmap, array, array)).
:- mode hash_table_ui == in(hash_table).
:- mode hash_table_di == di(hash_table).
:- mode hash_table_uo == out(hash_table).
:- type hash_pred(K) == ( pred(K, int, int) ).
:- inst hash_pred == ( pred(in, out, out) is det ).
% new(HashPred, N, MaxOccupancy)
% constructs a new hash table with initial size 2 ^ N that is
% doubled whenever MaxOccupancy is achieved; elements are
% indexed using HashPred.
%
% HashPred must compute two *independent* hashes for a given
% key - that is, one hash should not be a function of the other.
% Otherwise poor performance will result.
%
% N must be greater than 1.
% MaxOccupancy must be in (0.0, 1.0).
%
% XXX Values too close to the limits may cause bad things
% to happen.
%
:- func new(hash_pred(K), int, float) = hash_table(K, V).
:- mode new(in(hash_pred), in, in) = hash_table_uo is det.
% new_default(HashFn) constructs a hash table with default size and
% occupancy arguments.
%
:- func new_default(hash_pred(K)) = hash_table(K, V).
:- mode new_default(in(hash_pred)) = hash_table_uo is det.
% Retrieve the hash_pred associated with a hash table.
%
:- func hash_pred(hash_table(K, V)) = hash_pred(K).
:- mode hash_pred(hash_table_ui) = out(hash_pred) is det.
% Default hash_preds for ints and strings and everything (buwahahaha!)
%
:- pred int_double_hash(int, int, int).
:- mode int_double_hash(in, out, out) is det.
:- pred string_double_hash(string, int, int).
:- mode string_double_hash(in, out, out) is det.
:- pred char_double_hash(char, int, int).
:- mode char_double_hash(in, out, out) is det.
:- pred float_double_hash(float, int, int).
:- mode float_double_hash(in, out, out) is det.
:- pred generic_double_hash(T, int, int).
:- mode generic_double_hash(in, out, out) is det.
% Returns the number of buckets in a hash table.
%
:- func num_buckets(hash_table(K, V)) = int.
:- mode num_buckets(hash_table_ui) = out is det.
%:- mode num_buckets(in) = out is det.
% Returns the number of occupants in a hash table.
%
:- func num_occupants(hash_table(K, V)) = int.
:- mode num_occupants(hash_table_ui) = out is det.
%:- mode num_occupants(in) = out is det.
% Insert key-value binding into a hash table; if one is
% already there then the previous value is overwritten.
%
:- func set(hash_table(K, V), K, V) = hash_table(K, V).
:- mode set(hash_table_di, in, in) = hash_table_uo is det.
% Field update for hash tables.
% HT ^ elem(K) := V is equivalent to set(HT, K, V).
%
:- func 'elem :='(K, hash_table(K, V), V) = hash_table(K, V).
:- mode 'elem :='(in, hash_table_di, in) = hash_table_uo is det.
% Insert a key-value binding into a hash table. An
% exception is thrown if a binding for the key is already
% present.
%
:- func det_insert(hash_table(K, V), K, V) = hash_table(K, V).
:- mode det_insert(hash_table_di, in, in) = hash_table_uo is det.
% Change a key-value binding in a hash table. An
% exception is thrown if a binding for the key does not
% already exist.
%
:- func det_update(hash_table(K, V), K, V) = hash_table(K, V).
:- mode det_update(hash_table_di, in, in) = hash_table_uo is det.
% Delete the entry for the given key, leaving the hash table
% unchanged if there is no such entry.
%
:- func delete(hash_table(K, V), K) = hash_table(K, V).
:- mode delete(hash_table_di, in) = hash_table_uo is det.
% Lookup the value associated with the given key. An exception
% is raised if there is no entry for the key.
%
:- func lookup(hash_table(K, V), K) = V.
:- mode lookup(hash_table_ui, in) = out is det.
%:- mode lookup(in, in) = out is det.
% Field access for hash tables.
% HT ^ elem(K) is equivalent to lookup(HT, K).
%
:- func elem(K, hash_table(K, V)) = V.
:- mode elem(in, hash_table_ui) = out is det.
%:- mode elem(in, in) = out is det.
% Like lookup, but just fails if there is no entry for the key.
%
:- func search(hash_table(K, V), K) = V.
:- mode search(hash_table_ui, in) = out is semidet.
%:- mode search(in, in, out) is semidet.
:- pred search(hash_table(K, V), K, V).
:- mode search(hash_table_ui, in, out) is semidet.
%:- mode search(in, in, out) is semidet.
% Convert a hash table into an association list.
%
:- func to_assoc_list(hash_table(K, V)) = assoc_list(K, V).
:- mode to_assoc_list(hash_table_ui) = out is det.
%:- mode to_assoc_list(in) = out is det.
% Fold a function over the key-value bindings in a hash table.
%
:- func fold(func(K, V, T) = T, hash_table(K, V), T) = T.
:- mode fold(func(in, in, in) = out is det, hash_table_ui, in) = out is det.
:- mode fold(func(in, in, di) = uo is det, hash_table_ui, di) = uo is det.
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
:- implementation.
:- import_module math, bool, exception, list, require, std_util.
:- type hash_table(K, V)
---> ht(
num_buckets :: int,
num_occupants :: int,
max_occupants :: int,
hash_pred :: hash_pred(K),
bitmap :: bitmap,
keys :: array(K),
values :: array(V)
).
% ---------------------------------------------------------------------------- %
% THE HASHING SCHEME
%
% The user provided hashing function computes two independent
% hashes H1 and H2 for a given key K.
%
% We calculate D = 2*H2 + 1 (to ensure that D is non-zero and
% odd and is therefore coprime to the number of buckets) and
% probe the table where the Kth probe examines slot
%
% (H1 + K * H2) `mod` num_buckets
%
% The search is guaranteed to terminate because table occupancy
% must be less than 1.0.
%
% The bitmap keeps track of which slots are occupied.
%
% If a slot is occupied, then the corresponding element in
% the keys array is used to decide whether this probe has
% found the right slot.
%
% Once the right slot has been found, the value can be found
% in the corresponding element in the values array.
%
% The keys and values arrays cannot be initialised until at
% least one binding is inserted. This is handled by checking
% to see whether the keys array has non-zero size.
% ---------------------------------------------------------------------------- %
new(HashPred, N, MaxOccupancy) = HT :-
( if N =< 1 then
throw(software_error("hash_table__new_hash_table: N =< 1"))
else if N >= int__bits_per_int then
throw(software_error(
"hash_table__new_hash_table: N >= int__bits_per_int"))
else if MaxOccupancy =< 0.0 ; 1.0 =< MaxOccupancy then
throw(software_error(
"hash_table__new_hash_table: MaxOccupancy not in (0.0, 1.0)"))
else
NumBuckets = 1 << N,
MaxOccupants = ceiling_to_int(float(NumBuckets) * MaxOccupancy),
Bitmap = bitmap__new(NumBuckets, no),
Keys = array__make_empty_array,
Values = array__make_empty_array,
HT = ht(NumBuckets, 0, MaxOccupants, HashPred, Bitmap, Keys, Values)
).
% ---------------------------------------------------------------------------- %
% These numbers are picked out of thin air.
%
new_default(HashPred) = new(HashPred, 7, 0.9).
% ---------------------------------------------------------------------------- %
% find_slot(HT, K) looks up key K in hash table HT and
% returns the index for the entry K in H. This is either the
% first free slot identified (K is not in the table, but here
% is where it would go) or the slot for K (K is in the table
% and this is its slot).
%
% Whether or not K is actually in the table can be decided
% by checking to see whether its bit in the bitmap is set
% or clear.
%
:- func find_slot(hash_table(K, V), K) = int.
:- mode find_slot(hash_table_ui, in) = out is det.
%:- mode find_slot(in, in) = out is det.
find_slot(HT, K) = H :-
(HT ^ hash_pred)(K, Hash1, Hash2),
H0 = Hash1 mod HT ^ num_buckets,
% Have to ensure it's odd and non-zero.
Delta = Hash2 + Hash2 + 1,
H = find_slot_2(HT, K, H0, Delta).
:- func find_slot_2(hash_table(K, V), K, int, int) = int.
:- mode find_slot_2(hash_table_ui, in, in, in) = out is det.
%:- mode find_slot_2(in, in, in, in) = out is det.
find_slot_2(HT, K, H0, Delta) = H :-
( if bitmap__is_clear(HT ^ bitmap, H0) then
H = H0
else if HT ^ keys ^ elem(H0) = K then
H = H0
else
H1 = (H0 + Delta) mod HT ^ num_buckets,
H = find_slot_2(HT, K, H1, Delta)
).
% ---------------------------------------------------------------------------- %
set(HT0, K, V) = HT :-
% If this is the first entry in the hash table, then we use it to
% set up the hash table (the arrays are currently empty because we
% need values to initialise them with).
%
( if array__size(HT0 ^ values) = 0 then
HT = set(
(( HT0
^ keys := array__init(HT0 ^ num_buckets, K) )
^ values := array__init(HT0 ^ num_buckets, V) ),
K, V
)
else
H = find_slot(HT0, K),
( if bitmap__is_set(HT0 ^ bitmap, H) then
HT = ( HT0 ^ values ^ elem(H) := V )
else
HT =
( if HT0 ^ num_occupants = HT0 ^ max_occupants then
set(expand(HT0), K, V)
else
(((( HT0
^ num_occupants := HT0 ^ num_occupants + 1 )
^ bitmap := bitmap__set(HT0 ^ bitmap, H) )
^ keys ^ elem(H) := K )
^ values ^ elem(H) := V )
)
)
).
'elem :='(K, HT, V) = set(HT, K, V).
% ---------------------------------------------------------------------------- %
search(HT, K, search(HT, K)).
search(HT, K) = V :-
H = find_slot(HT, K),
bitmap__is_set(HT ^ bitmap, H),
V = HT ^ values ^ elem(H).
% ---------------------------------------------------------------------------- %
det_insert(HT, K, V) =
( if bitmap__is_set(HT ^ bitmap, H) then
throw(software_error("hash_table__det_insert: key already present"))
else if HT ^ num_occupants = HT ^ max_occupants then
set(expand(HT), K, V)
else if array__size(HT ^ values) = 0 then
% If this is the first entry in the hash table, then we use it to
% set up the hash table (the arrays are currently empty because we
% need values to initialise them with).
%
set(HT, K, V)
else
(((( HT
^ num_occupants := HT ^ num_occupants + 1 )
^ bitmap := bitmap__set(HT ^ bitmap, H) )
^ keys ^ elem(H) := K )
^ values ^ elem(H) := V )
)
:-
H = find_slot(HT, K).
% ---------------------------------------------------------------------------- %
det_update(HT, K, V) =
( if bitmap__is_clear(HT ^ bitmap, H) then
throw(software_error("hash_table__det_update: key not found"))
else
HT ^ values ^ elem(H) := V
)
:-
H = find_slot(HT, K).
% ---------------------------------------------------------------------------- %
lookup(HT, K) =
( if V = search(HT, K)
then V
else throw(software_error("hash_table__lookup: key not found"))
).
elem(K, HT) = lookup(HT, K).
% ---------------------------------------------------------------------------- %
delete(HT, K) =
HT ^ bitmap := bitmap__clear(HT ^ bitmap, find_slot(HT, K)).
% ---------------------------------------------------------------------------- %
to_assoc_list(HT) = to_assoc_list_2(0, HT, []).
:- func to_assoc_list_2(int,hash_table(K,V),assoc_list(K,V)) = assoc_list(K,V).
:- mode to_assoc_list_2(in, hash_table_ui, in) = out is det.
%:- mode to_assoc_list_2(in, in, in) = out is det.
to_assoc_list_2(I, HT, AList) =
( if I >= HT ^ num_buckets then
AList
else if bitmap__is_clear(HT ^ bitmap, I) then
to_assoc_list_2(I + 1, HT, AList)
else
to_assoc_list_2(I + 1, HT,
[(HT ^ keys ^ elem(I)) - (HT ^ values ^ elem(I)) | AList])
).
% ---------------------------------------------------------------------------- %
% Hash tables expand by doubling in size.
%
:- func expand(hash_table(K, V)) = hash_table(K, V).
:- mode expand(hash_table_di) = hash_table_uo is det.
expand(HT0) = HT :-
HT0 = ht(NBs0, _NOs, MOs0, HP, BM0, Ks0, Vs0),
NBs = NBs0 + NBs0,
MOs = MOs0 + MOs0,
BM = bitmap__new(NBs, no),
Ks = array__make_empty_array,
Vs = array__make_empty_array,
HT1 = ht(NBs, 0, MOs, HP, BM, Ks, Vs),
HT = reinsert_bindings(0, NBs0, BM0, Ks0, Vs0, HT1).
% ---------------------------------------------------------------------------- %
:- func reinsert_bindings(int, int, bitmap, array(K), array(V),
hash_table(K, V)) = hash_table(K, V).
:- mode reinsert_bindings(in, in, bitmap_ui, array_ui, array_ui,
hash_table_di) = hash_table_uo is det.
reinsert_bindings(I, NumBuckets, Bitmap, Keys, Values, HT) =
( if I >= NumBuckets then
HT
else if bitmap__is_clear(Bitmap, I) then
reinsert_bindings(I + 1, NumBuckets, Bitmap, Keys, Values, HT)
else
reinsert_bindings(I + 1, NumBuckets, Bitmap, Keys, Values,
set(HT, Keys ^ elem(I), Values ^ elem(I)))
).
% ---------------------------------------------------------------------------- %
% There are almost certainly better ones out there...
%
% NOTE that H1 \= N since neither of H1 or H2 should be a function
% of the other under machine integer arithmetic.
%
int_double_hash(N, H1, H2) :-
H1 = N * N,
H2 = N `xor` (N + N).
% ---------------------------------------------------------------------------- %
% There are almost certainly better ones out there...
%
string_double_hash(S, H1, H2) :-
H1 = string__hash(S),
H2 = string__foldl(func(C, N) = char__to_int(C) + N, S, 0).
%------------------------------------------------------------------------------%
% There are almost certainly better ones out there...
%
float_double_hash(F, H1, H2) :-
H1 = float__hash(F),
H2 = float__hash(F * F).
%------------------------------------------------------------------------------%
% There are almost certainly better ones out there...
%
char_double_hash(C, H1, H2) :-
int_double_hash(char__to_int(C), H1, H2).
% ---------------------------------------------------------------------------- %
% This, again, is straight off the top of my head.
%
generic_double_hash(T, Ha, Hb) :-
( if dynamic_cast(T, Int) then
int_double_hash(Int, Ha, Hb)
else if dynamic_cast(T, String) then
string_double_hash(String, Ha, Hb)
else if dynamic_cast(T, Float) then
float_double_hash(Float, Ha, Hb)
else if dynamic_cast(T, Char) then
char_double_hash(Char, Ha, Hb)
else if dynamic_cast(T, Univ) then
generic_double_hash(univ_value(Univ), Ha, Hb)
else if dynamic_cast_to_array(T, Array) then
{Ha, Hb} =
array__foldl(
( func(X, {HA0, HB0}) = {HA, HB} :-
generic_double_hash(X, HXA, HXB),
double_munge(HXA, HA0, HA, HXB, HB0, HB)
),
Array,
{0, 0}
)
else
deconstruct(T, FunctorName, Arity, Args),
string_double_hash(FunctorName, Ha0, Hb0),
double_munge(Arity, Ha0, Ha1, Arity, Hb0, Hb1),
list__foldl2(
( pred(U::in, HA0::in, HA::out, HB0::in, HB::out) is det :-
generic_double_hash(U, HUA, HUB),
double_munge(HUA, HA0, HA, HUB, HB0, HB)
),
Args,
Ha1, Ha,
Hb1, Hb
)
).
% ---------------------------------------------------------------------------- %
:- func munge_factor_a = int.
munge_factor_a = 5.
:- func munge_factor_b = int.
munge_factor_b = 3.
:- pred double_munge(int, int, int, int, int, int).
:- mode double_munge(in, in, out, in, in, out) is det.
double_munge(X, Ha0, Ha, Y, Hb0, Hb) :-
Ha = munge(munge_factor_a, Ha0, X),
Hb = munge(munge_factor_b, Hb0, Y).
:- func munge(int, int, int) = int.
munge(N, X, Y) =
(X `unchecked_left_shift` N) `xor`
(X `unchecked_right_shift` (int__bits_per_int - N)) `xor`
Y.
% ---------------------------------------------------------------------------- %
fold(Fn, HT, X) = fold_0(0, Fn, HT, X).
:- func fold_0(int, func(K, V, T) = T, hash_table(K,V), T) = T.
:- mode fold_0(in, func(in,in,in) = out is det, hash_table_ui, in) = out is det.
:- mode fold_0(in, func(in,in,di) = uo is det, hash_table_ui, di) = uo is det.
% :- mode fold_0(in, func(in,in,in) = out is det, in, in) = out is det.
% :- mode fold_0(in, func(in,in,di) = uo is det, in, di) = uo is det.
fold_0(I, Fn, HT, X) =
( if I >= HT ^ num_buckets then
X
else if bitmap__is_clear(HT ^ bitmap, I) then
fold_0(I + 1, Fn, HT, X)
else
fold_0(I + 1, Fn, HT, Fn(HT ^ keys ^ elem(I), HT ^ values ^ elem(I), X))
).
% ---------------------------------------------------------------------------- %
% XXX To go into array.m
%
% dynamic_cast/2 won't work for arbitrary arrays since array/1 is
% not a ground type (that is, dynamic_cast/2 will work when the
% target type is e.g. array(int), but not when it is array(T)).
%
:- some [T2] pred dynamic_cast_to_array(T1, array(T2)).
:- mode dynamic_cast_to_array(in, out) is semidet.
dynamic_cast_to_array(X, A) :-
% If X is an array then it has a type with one type argument.
%
[ArgTypeDesc] = type_args(type_of(X)),
% Convert ArgTypeDesc to a type variable ArgType.
%
(_ `with_type` ArgType) `has_type` ArgTypeDesc,
% Constrain the type of A to be array(ArgType) and do the
% cast.
%
dynamic_cast(X, A `with_type` array(ArgType)).
% ---------------------------------------------------------------------------- %
% ---------------------------------------------------------------------------- %