-
Notifications
You must be signed in to change notification settings - Fork 4
/
filter.go
101 lines (88 loc) · 3.39 KB
/
filter.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
package bloom
import (
"github.com/bits-and-blooms/bitset"
"github.com/spaolacci/murmur3"
"go-lsm/kv"
"math"
"unsafe"
)
const uin8Size = int(unsafe.Sizeof(uint8(0)))
const FalsePositiveRate = 0.01
// Filter represents Bloom filter.
// Bloom filter is a probabilistic data structure used to test whether an element maybe present in the dataset.
// A bloom filter can query against large amounts of data and return either “possibly in the set” or “definitely not in the set”.
// It depends on M-sized bit vector and K-hash functions.
type Filter struct {
numberOfHashFunctions uint8
falsePositiveRate float64
bitVector *bitset.BitSet
}
// DecodeToBloomFilter decodes the byte slice to the bloom filter.
// It relies on bitset.BitSet for decoding.
func DecodeToBloomFilter(buffer []byte, falsePositiveRate float64) (Filter, error) {
bitVector := new(bitset.BitSet)
filter := buffer[:len(buffer)-uin8Size]
if err := bitVector.UnmarshalBinary(filter); err != nil {
return Filter{}, err
}
return Filter{
numberOfHashFunctions: numberOfHashFunctions(falsePositiveRate),
falsePositiveRate: falsePositiveRate,
bitVector: bitVector,
}, nil
}
// Encode encodes the filter to a byte slice.
// It relies on bitset.BitSet for encoding. The encoded format is:
/*
------------------------------------------------
| bit vector | 1 byte for numberOfHashFunctions |
------------------------------------------------
*/
func (filter Filter) Encode() ([]byte, error) {
buffer, err := filter.bitVector.MarshalBinary()
if err != nil {
return nil, err
}
return append(buffer, filter.numberOfHashFunctions), nil
}
// add adds the given key in the bloom filter by setting the positions (/indices) of the key in the bit vector.
func (filter Filter) add(key kv.Key) {
positions := filter.bitPositionsFor(key)
for index := 0; index < len(positions); index++ {
position := positions[index]
filter.bitVector.Set(uint(position))
}
}
// MayContain returns true if all the bits identified by the positions (/indices) for the key are add.
// True indicates that the key MAYBE present in the system.
// Returns false, if any of the bits identified by the positions (/indices) for the key are not add.
// False indicates that the key is definitely NOT present in the system.
func (filter Filter) MayContain(key kv.Key) bool {
positions := filter.bitPositionsFor(key)
for index := 0; index < len(positions); index++ {
position := positions[index]
if !filter.bitVector.Test(uint(position)) {
return false
}
}
return true
}
// bitPositionsFor returns the bit vector positions (/indices) for the key which must either be added or checked.
func (filter Filter) bitPositionsFor(key kv.Key) []uint32 {
indices := make([]uint32, 0, filter.numberOfHashFunctions)
for index := uint8(0); index < filter.numberOfHashFunctions; index++ {
hash := murmur3.Sum32WithSeed(key.RawBytes(), uint32(index))
indices = append(indices, hash%uint32(filter.bitVector.Len()))
}
return indices
}
// numberOfHashFunctions returns the number of hash functions.
func numberOfHashFunctions(falsePositiveRate float64) uint8 {
return uint8(math.Ceil(math.Log2(1.0 / falsePositiveRate)))
}
// bitVectorSize returns the bit vector size.
func bitVectorSize(capacity int, falsePositiveRate float64) int {
//ln22 = ln2^2
ln22 := math.Pow(math.Ln2, 2)
return int(float64(capacity) * math.Abs(math.Log(falsePositiveRate)) / ln22)
}