Inbloom 是跨语言的 Bloom filter 实现。Bloom filter 是由 Howard Bloom 在 1970
年提出的二进制向量数据结构,它具有很好的空间和时间效率,被用来检测一个元素是不是集合中的一个成员。如果检测结果为是,该元素不一定在集合中;但如果
检测结果为否,该元素一定不在集合中。因此Bloom
filter具有100%的召回率。这样每个检测请求返回有“在集合内(可能错误)”和“不在集合内(绝对不在集合内)”两种情况,可见 Bloom
filter 是牺牲了正确率和时间以节省空间。
![image](http://static.oschina.net/uploads/space/2015/0731/112657_Miz7_2306979.png)
示例代码:
Python:
<pre class="brush:python;toolbar: true; auto-links: false;">import inbloom
import base64
import requests
# Basic usage
bf = inbloom.Filter(entries=100, error=0.01)
bf.add("abc")
bf.add("def")
assert bf.contains("abc")
assert bf.contains("def")
assert not bf.contains("ghi")
bf2 = inbloom.Filter(entries=100, error=0.01, data=bf.buffer())
assert bf2.contains("abc")
assert bf2.contains("def")
assert not bf2.contains("ghi")
# Serialization
payload = 'Yg0AZAAAABQAAAAAACAAEAAIAAAAAAAAIAAQAAgABAA='
assert base64.b64encode(inbloom.dump(inbloom.load(base64.b64decode(payload)))) == payload
# Sending it over HTTP
serialized = base64.b64encode(inbloom.dump(bf))
requests.get('http://api.endpoint.me', params={'filter': serialized})</pre>
Go:
<pre class="brush:cpp ;toolbar: true; auto-links: false;">// create a blank filter - expecting 20 members and an error rate of 1/100
f, err := NewFilter(20, 0.01)
if err != nil {
panic(err)
}
// the size of the filter
fmt.Println(f.Len())
// insert some values
f.Add("foo")
f.Add("bar")
// test for existence of keys
fmt.Println(f.Contains("foo"))
fmt.Println(f.Contains("wat"))
fmt.Println("marshaled data:", f.MarshalBase64())
// Output:
// 24
// true
// false
// marshaled data: oU4AZAAAABQAAAAAAEIAABEAGAQAAgAgAAAwEAAJAAA=</pre>
<pre class="brush:cpp ;toolbar: true; auto-links: false;">// a 20 cardinality 0.01 precision filter with "foo" and "bar" in it
data := "oU4AZAAAABQAAAAAAEIAABEAGAQAAgAgAAAwEAAJAAA="
// load it from base64
f, err := UnmarshalBase64(data)
if err != nil {
panic(err)
}
// test it...
fmt.Println(f.Contains("foo"))
fmt.Println(f.Contains("wat"))
fmt.Println(f.Len())
// dump to pure binary
fmt.Printf("%x\n", f.Marshal())
// Output:
// true
// false
// 24
// a14e006400000014000000000042000011001804000200200000301000090000</pre>
Java:
<pre class="brush:java;toolbar: true; auto-links: false;">// The basics
BloomFilter bf = new BloomFilter(20, 0.01);
bf.add("foo");
bf.add("bar");
assertTrue(bf.contains("foo"));
assertTrue(bf.contains("bar"));
assertFalse(bf.contains("baz"));
BloomFilter bf2 = new BloomFilter(bf.bf, bf.entries, bf.error);
assertTrue(bf2.contains("foo"));
assertTrue(bf2.contains("bar"));
assertFalse(bf2.contains("baz"));
// Serialization
String serialized = BinAscii.hexlify(BloomFilter.dump(bf));
System.out.printf("Serialized: %s\n", serialized);
String hexPayload = "620d006400000014000000000020001000080000000000002000100008000400";
BloomFilter deserialized = BloomFilter.load(BinAscii.unhexlify(hexPayload));
String dump = BinAscii.hexlify(BloomFilter.dump(deserialized));
System.out.printf("Re-Serialized: %s\n", dump);
assertEquals(dump.toLowerCase(), hexPayload);
assertEquals(deserialized.entries, 20);
assertEquals(deserialized.error, 0.01);
assertTrue(deserialized.contains("abc"));</pre>
- 授权协议:
- BSD
- 开发语言:
- JavaGoogle Go 查看源码»
- 操作系统:
- 跨平台